diff --git a/.github/actions/veristat_baseline_compare/action.yml b/.github/actions/veristat_baseline_compare/action.yml new file mode 100644 index 0000000000000..4a24166a97a9a --- /dev/null +++ b/.github/actions/veristat_baseline_compare/action.yml @@ -0,0 +1,49 @@ +name: 'run-veristat' +description: 'Run veristat benchmark' +inputs: + veristat_output: + description: 'Veristat output filepath' + required: true + baseline_name: + description: 'Veristat baseline cache name' + required: true +runs: + using: "composite" + steps: + - uses: actions/upload-artifact@v4 + with: + name: ${{ inputs.baseline_name }} + if-no-files-found: error + path: ${{ github.workspace }}/${{ inputs.veristat_output }} + + # For pull request: + # - get baseline log from cache + # - compare it to current run + - if: ${{ github.event_name == 'pull_request' }} + uses: actions/cache/restore@v4 + with: + key: ${{ inputs.baseline_name }}-${{ github.base_ref }} + restore-keys: | + ${{ inputs.baseline_name }}- + path: '${{ github.workspace }}/${{ inputs.baseline_name }}' + + - if: ${{ github.event_name == 'pull_request' }} + name: Show veristat comparison + shell: bash + run: ./.github/scripts/compare-veristat-results.sh + env: + BASELINE_PATH: ${{ github.workspace }}/${{ inputs.baseline_name }} + VERISTAT_OUTPUT: ${{ inputs.veristat_output }} + + # For push: just put baseline log to cache + - if: ${{ github.event_name == 'push' }} + shell: bash + run: | + mv "${{ github.workspace }}/${{ inputs.veristat_output }}" \ + "${{ github.workspace }}/${{ inputs.baseline_name }}" + + - if: ${{ github.event_name == 'push' }} + uses: actions/cache/save@v4 + with: + key: ${{ inputs.baseline_name }}-${{ github.ref_name }}-${{ github.run_id }} + path: '${{ github.workspace }}/${{ inputs.baseline_name }}' diff --git a/.github/scripts/compare-veristat-results.sh b/.github/scripts/compare-veristat-results.sh new file mode 100755 index 0000000000000..f95c3c192d80d --- /dev/null +++ b/.github/scripts/compare-veristat-results.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +if [[ ! -f "${BASELINE_PATH}" ]]; then + echo "# No ${BASELINE_PATH} available" >> "${GITHUB_STEP_SUMMARY}" + + echo "No ${BASELINE_PATH} available" + echo "Printing veristat results" + cat "${VERISTAT_OUTPUT}" + + exit +fi + +selftests/bpf/veristat \ + --output-format csv \ + --emit file,prog,verdict,states \ + --compare "${BASELINE_PATH}" "${VERISTAT_OUTPUT}" > compare.csv + +python3 ./.github/scripts/veristat_compare.py compare.csv diff --git a/.github/scripts/download-gcc-bpf.sh b/.github/scripts/download-gcc-bpf.sh new file mode 100755 index 0000000000000..894584a01b2ec --- /dev/null +++ b/.github/scripts/download-gcc-bpf.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -euo pipefail + +GCC_BPF_RELEASE_GH_REPO=$1 +INSTALL_DIR=$(realpath $2) + +cd /tmp + +tag=$(gh release list -L 1 -R ${GCC_BPF_RELEASE_GH_REPO} --json tagName -q .[].tagName) +if [[ -z "$tag" ]]; then + echo "Could not find latest GCC BPF release at ${GCC_BPF_RELEASE_GH_REPO}" + exit 1 +fi + +url="https://github.com/${GCC_BPF_RELEASE_GH_REPO}/releases/download/${tag}/${tag}.tar.zst" +echo "Downloading $url" +wget -q "$url" + +tarball=${tag}.tar.zst +dir=$(tar tf $tarball | head -1 || true) + +echo "Extracting $tarball ..." +tar -I zstd -xf $tarball && rm -f $tarball + +rm -rf $INSTALL_DIR +mv -v $dir $INSTALL_DIR + +cd - + diff --git a/.github/scripts/matrix.py b/.github/scripts/matrix.py new file mode 100644 index 0000000000000..d62e12a1a2d76 --- /dev/null +++ b/.github/scripts/matrix.py @@ -0,0 +1,305 @@ +#!/usr/bin/env python3 + +import os +import dataclasses +import json +import requests + +from enum import Enum +from typing import Any, Dict, List, Final, Set, Union, Optional + +MANAGED_OWNER: Final[str] = "kernel-patches" +MANAGED_REPOS: Final[Set[str]] = { + f"{MANAGED_OWNER}/bpf", + f"{MANAGED_OWNER}/vmtest", +} + +DEFAULT_SELF_HOSTED_RUNNER_TAGS: Final[List[str]] = ["self-hosted", "docker-noble-main"] +DEFAULT_GITHUB_HOSTED_RUNNER: Final[str] = "ubuntu-24.04" +DEFAULT_LLVM_VERSION: Final[int] = 17 + +RUNNERS_BUSY_THRESHOLD: Final[float] = 0.8 + + +class Arch(str, Enum): + """ + CPU architecture supported by CI. + """ + + AARCH64 = "aarch64" + S390X = "s390x" + X86_64 = "x86_64" + + +class Compiler(str, Enum): + GCC = "gcc" + LLVM = "llvm" + + +@dataclasses.dataclass +class Toolchain: + compiler: Compiler + # This is relevant ONLY for LLVM and should not be required for GCC + version: int + + @property + def short_name(self) -> str: + return str(self.compiler.value) + + @property + def full_name(self) -> str: + if self.compiler == Compiler.GCC: + return self.short_name + + return f"{self.short_name}-{self.version}" + + def to_dict(self) -> Dict[str, Union[str, int]]: + return { + "name": self.short_name, + "fullname": self.full_name, + "version": self.version, + } + + +def query_runners_from_github() -> List[Dict[str, Any]]: + if "GITHUB_TOKEN" not in os.environ: + return [] + token = os.environ["GITHUB_TOKEN"] + headers = { + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", + } + owner = os.environ["GITHUB_REPOSITORY_OWNER"] + url: Optional[str] = f"https://api.github.com/orgs/{owner}/actions/runners" + # GitHub returns 30 runners per page, fetch all + all_runners = [] + try: + while url is not None: + response = requests.get(url, headers=headers) + if response.status_code != 200: + print(f"Failed to query runners: {response.status_code}") + print(f"response: {response.text}") + return [] + data = response.json() + all_runners.extend(data.get("runners", [])) + # Check for next page URL in Link header + url = None + if "Link" in response.headers: + links = requests.utils.parse_header_links(response.headers["Link"]) + for link in links: + if link["rel"] == "next": + url = link["url"] + break + return all_runners + except Exception as e: + print(f"Warning: Failed to query runner status due to exception: {e}") + return [] + + +all_runners_cached: Optional[List[Dict[str, Any]]] = None + + +def all_runners() -> List[Dict[str, Any]]: + global all_runners_cached + if all_runners_cached is None: + print("Querying runners from GitHub...") + all_runners_cached = query_runners_from_github() + print(f"Github returned {len(all_runners_cached)} runners") + counts = count_by_status(all_runners_cached) + print( + f"Busy: {counts['busy']}, Idle: {counts['idle']}, Offline: {counts['offline']}" + ) + return all_runners_cached + + +def runner_labels(runner: Dict[str, Any]) -> List[str]: + return [label["name"] for label in runner["labels"]] + + +def is_self_hosted_runner(runner: Dict[str, Any]) -> bool: + labels = runner_labels(runner) + for label in DEFAULT_SELF_HOSTED_RUNNER_TAGS: + if label not in labels: + return False + return True + + +def self_hosted_runners() -> List[Dict[str, Any]]: + runners = all_runners() + return [r for r in runners if is_self_hosted_runner(r)] + + +def runners_by_arch(arch: Arch) -> List[Dict[str, Any]]: + runners = self_hosted_runners() + return [r for r in runners if arch.value in runner_labels(r)] + + +def count_by_status(runners: List[Dict[str, Any]]) -> Dict[str, int]: + result = {"busy": 0, "idle": 0, "offline": 0} + for runner in runners: + if runner["status"] == "online": + if runner["busy"]: + result["busy"] += 1 + else: + result["idle"] += 1 + else: + result["offline"] += 1 + return result + + +@dataclasses.dataclass +class BuildConfig: + arch: Arch + toolchain: Toolchain + kernel: str = "LATEST" + run_veristat: bool = False + parallel_tests: bool = False + build_release: bool = False + + @property + def runs_on(self) -> List[str]: + if is_managed_repo(): + return DEFAULT_SELF_HOSTED_RUNNER_TAGS + [self.arch.value] + else: + return [DEFAULT_GITHUB_HOSTED_RUNNER] + + @property + def build_runs_on(self) -> List[str]: + if not is_managed_repo(): + return [DEFAULT_GITHUB_HOSTED_RUNNER] + + # @Temporary: disable codebuild runners for cross-compilation jobs + match self.arch: + case Arch.S390X: + return DEFAULT_SELF_HOSTED_RUNNER_TAGS + [Arch.X86_64.value] + case Arch.AARCH64: + return DEFAULT_SELF_HOSTED_RUNNER_TAGS + [Arch.AARCH64.value] + + # For managed repos, check the busyness of relevant self-hosted runners + # If they are too busy, use codebuild + runner_arch = self.arch + # We don't build s390x kernel on s390x runners, because it's too slow + # Cross-compiling on x86_64 is faster + if runner_arch == Arch.S390X: + runner_arch = Arch.X86_64 + runners = runners_by_arch(runner_arch) + counts = count_by_status(runners) + online = counts["idle"] + counts["busy"] + busy = counts["busy"] + # if online <= 0, then something is wrong, don't use codebuild + if online > 0 and busy / online > RUNNERS_BUSY_THRESHOLD: + return ["codebuild"] + else: + return DEFAULT_SELF_HOSTED_RUNNER_TAGS + [runner_arch.value] + + @property + def tests(self) -> Dict[str, Any]: + tests_list = [ + "test_progs", + "test_progs_parallel", + "test_progs_no_alu32", + "test_progs_no_alu32_parallel", + "test_verifier", + ] + + if self.arch.value != "s390x": + tests_list.append("test_maps") + + if self.toolchain.version >= 18: + tests_list.append("test_progs_cpuv4") + + # if self.arch in [Arch.X86_64, Arch.AARCH64]: + # tests_list.append("sched_ext") + + # Don't run GCC BPF runner, because too many tests are failing + # See: https://lore.kernel.org/bpf/87bjw6qpje.fsf@oracle.com/ + # if self.arch == Arch.X86_64: + # tests_list.append("test_progs-bpf_gcc") + + if not self.parallel_tests: + tests_list = [test for test in tests_list if not test.endswith("parallel")] + + return {"include": [generate_test_config(test) for test in tests_list]} + + def to_dict(self) -> Dict[str, Any]: + return { + "arch": self.arch.value, + "toolchain": self.toolchain.to_dict(), + "kernel": self.kernel, + "run_veristat": self.run_veristat, + "parallel_tests": self.parallel_tests, + "build_release": self.build_release, + "runs_on": self.runs_on, + "tests": self.tests, + "build_runs_on": self.build_runs_on, + } + + +def is_managed_repo() -> bool: + return ( + os.environ["GITHUB_REPOSITORY_OWNER"] == MANAGED_OWNER + and os.environ["GITHUB_REPOSITORY"] in MANAGED_REPOS + ) + + +def set_output(name, value): + """Write an output variable to the GitHub output file.""" + with open(os.getenv("GITHUB_OUTPUT"), "a", encoding="utf-8") as file: + file.write(f"{name}={value}\n") + + +def generate_test_config(test: str) -> Dict[str, Union[str, int]]: + """Create the configuration for the provided test.""" + is_parallel = test.endswith("_parallel") + config = { + "test": test, + "continue_on_error": is_parallel, + # While in experimental mode, parallel jobs may get stuck + # anywhere, including in user space where the kernel won't detect + # a problem and panic. We add a second layer of (smaller) timeouts + # here such that if we get stuck in a parallel run, we hit this + # timeout and fail without affecting the overall job success (as + # would be the case if we hit the job-wide timeout). For + # non-experimental jobs, 360 is the default which will be + # superseded by the overall workflow timeout (but we need to + # specify something). + "timeout_minutes": 30 if is_parallel else 360, + } + return config + + +if __name__ == "__main__": + matrix = [ + BuildConfig( + arch=Arch.X86_64, + toolchain=Toolchain(compiler=Compiler.GCC, version=DEFAULT_LLVM_VERSION), + run_veristat=True, + parallel_tests=True, + ), + BuildConfig( + arch=Arch.X86_64, + toolchain=Toolchain(compiler=Compiler.LLVM, version=DEFAULT_LLVM_VERSION), + build_release=True, + ), + BuildConfig( + arch=Arch.X86_64, + toolchain=Toolchain(compiler=Compiler.LLVM, version=18), + build_release=True, + ), + BuildConfig( + arch=Arch.AARCH64, + toolchain=Toolchain(compiler=Compiler.GCC, version=DEFAULT_LLVM_VERSION), + ), + BuildConfig( + arch=Arch.S390X, + toolchain=Toolchain(compiler=Compiler.GCC, version=DEFAULT_LLVM_VERSION), + ), + ] + + # Outside of managed repositories only run on x86_64 + if not is_managed_repo(): + matrix = [config for config in matrix if config.arch == Arch.X86_64] + + json_matrix = json.dumps({"include": [config.to_dict() for config in matrix]}) + print(json.dumps(json.loads(json_matrix), indent=4)) + set_output("build_matrix", json_matrix) diff --git a/.github/scripts/tests/test_veristat_compare.py b/.github/scripts/tests/test_veristat_compare.py new file mode 100644 index 0000000000000..b65b69295235d --- /dev/null +++ b/.github/scripts/tests/test_veristat_compare.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 + +import unittest +from typing import Iterable, List + +from ..veristat_compare import parse_table, VeristatFields + + +def gen_csv_table(records: Iterable[str]) -> List[str]: + return [ + ",".join(VeristatFields.headers()), + *records, + ] + + +class TestVeristatCompare(unittest.TestCase): + def test_parse_table_ignore_new_prog(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,N/A,success,N/A,N/A,1,N/A", + ] + ) + veristat_info = parse_table(table) + self.assertEqual(veristat_info.table, []) + self.assertFalse(veristat_info.changes) + self.assertFalse(veristat_info.new_failures) + + def test_parse_table_ignore_removed_prog(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,success,N/A,N/A,1,N/A,N/A", + ] + ) + veristat_info = parse_table(table) + self.assertEqual(veristat_info.table, []) + self.assertFalse(veristat_info.changes) + self.assertFalse(veristat_info.new_failures) + + def test_parse_table_new_failure(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,success,failure,MISMATCH,1,1,+0 (+0.00%)", + ] + ) + veristat_info = parse_table(table) + self.assertEqual( + veristat_info.table, + [["prog_file.bpf.o", "prog_name", "success -> failure (!!)", "+0.00 %"]], + ) + self.assertTrue(veristat_info.changes) + self.assertTrue(veristat_info.new_failures) + + def test_parse_table_new_changes(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,failure,success,MISMATCH,0,0,+0 (+0.00%)", + "prog_file.bpf.o,prog_name_increase,failure,failure,MATCH,1,2,+1 (+100.00%)", + "prog_file.bpf.o,prog_name_decrease,success,success,MATCH,1,1,-1 (-100.00%)", + ] + ) + veristat_info = parse_table(table) + self.assertEqual( + veristat_info.table, + [ + ["prog_file.bpf.o", "prog_name", "failure -> success", "+0.00 %"], + ["prog_file.bpf.o", "prog_name_increase", "failure", "+100.00 %"], + ["prog_file.bpf.o", "prog_name_decrease", "success", "-100.00 %"], + ], + ) + self.assertTrue(veristat_info.changes) + self.assertFalse(veristat_info.new_failures) + + +if __name__ == "__main__": + unittest.main() diff --git a/.github/scripts/tmpfsify-workspace.sh b/.github/scripts/tmpfsify-workspace.sh new file mode 100755 index 0000000000000..6fd62b4ad2a49 --- /dev/null +++ b/.github/scripts/tmpfsify-workspace.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -x -euo pipefail + +TMPFS_SIZE=20 # GB +MEM_TOTAL=$(awk '/MemTotal/ {print int($2/1024)}' /proc/meminfo) + +# sanity check: total mem is at least double TMPFS_SIZE +if [ $MEM_TOTAL -lt $(($TMPFS_SIZE*1024*2)) ]; then + echo "tmpfsify-workspace.sh: will not allocate tmpfs, total memory is too low (${MEM_TOTAL}MB)" + exit 0 +fi + +dir="$(basename "$GITHUB_WORKSPACE")" +cd "$(dirname "$GITHUB_WORKSPACE")" +mv "${dir}" "${dir}.backup" +mkdir "${dir}" +sudo mount -t tmpfs -o size=${TMPFS_SIZE}G tmpfs "${dir}" +rsync -a "${dir}.backup/" "${dir}" +cd - + diff --git a/.github/scripts/veristat_compare.py b/.github/scripts/veristat_compare.py new file mode 100644 index 0000000000000..07271b8cbd3aa --- /dev/null +++ b/.github/scripts/veristat_compare.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 + +# This script reads a CSV file produced by the following invocation: +# +# veristat --emit file,prog,verdict,states \ +# --output-format csv \ +# --compare ... +# +# And produces a markdown summary for the file. +# The summary is printed to standard output and appended to a file +# pointed to by GITHUB_STEP_SUMMARY variable. +# +# Script exits with return code 1 if there are new failures in the +# veristat results. +# +# For testing purposes invoke as follows: +# +# GITHUB_STEP_SUMMARY=/dev/null python3 veristat-compare.py test.csv +# +# File format (columns): +# 0. file_name +# 1. prog_name +# 2. verdict_base +# 3. verdict_comp +# 4. verdict_diff +# 5. total_states_base +# 6. total_states_comp +# 7. total_states_diff +# +# Records sample: +# file-a,a,success,failure,MISMATCH,12,12,+0 (+0.00%) +# file-b,b,success,success,MATCH,67,67,+0 (+0.00%) +# +# For better readability suffixes '_OLD' and '_NEW' +# are used instead of '_base' and '_comp' for variable +# names etc. + +import io +import os +import sys +import re +import csv +import logging +import argparse +import enum +from dataclasses import dataclass +from typing import Dict, Iterable, List, Final + + +TRESHOLD_PCT: Final[int] = 0 + +SUMMARY_HEADERS = ["File", "Program", "Verdict", "States Diff (%)"] + +# expected format: +0 (+0.00%) / -0 (-0.00%) +TOTAL_STATES_DIFF_REGEX = ( + r"(?P[+-]\d+) \((?P[+-]\d+\.\d+)\%\)" +) + + +TEXT_SUMMARY_TEMPLATE: Final[str] = ( + """ +# {title} + +{table} +""".strip() +) + +HTML_SUMMARY_TEMPLATE: Final[str] = ( + """ +# {title} + +
+Click to expand + +{table} +
+""".strip() +) + +GITHUB_MARKUP_REPLACEMENTS: Final[Dict[str, str]] = { + "->": "→", + "(!!)": ":bangbang:", +} + +NEW_FAILURE_SUFFIX: Final[str] = "(!!)" + + +class VeristatFields(str, enum.Enum): + FILE_NAME = "file_name" + PROG_NAME = "prog_name" + VERDICT_OLD = "verdict_base" + VERDICT_NEW = "verdict_comp" + VERDICT_DIFF = "verdict_diff" + TOTAL_STATES_OLD = "total_states_base" + TOTAL_STATES_NEW = "total_states_comp" + TOTAL_STATES_DIFF = "total_states_diff" + + @classmethod + def headers(cls) -> List[str]: + return [ + cls.FILE_NAME, + cls.PROG_NAME, + cls.VERDICT_OLD, + cls.VERDICT_NEW, + cls.VERDICT_DIFF, + cls.TOTAL_STATES_OLD, + cls.TOTAL_STATES_NEW, + cls.TOTAL_STATES_DIFF, + ] + + +@dataclass +class VeristatInfo: + table: list + changes: bool + new_failures: bool + + def get_results_title(self) -> str: + if self.new_failures: + return "There are new veristat failures" + + if self.changes: + return "There are changes in verification performance" + + return "No changes in verification performance" + + def get_results_summary(self, markup: bool = False) -> str: + title = self.get_results_title() + if not self.table: + return f"# {title}\n" + + template = TEXT_SUMMARY_TEMPLATE + table = format_table(headers=SUMMARY_HEADERS, rows=self.table) + + if markup: + template = HTML_SUMMARY_TEMPLATE + table = github_markup_decorate(table) + + return template.format(title=title, table=table) + + +def get_state_diff(value: str) -> float: + if value == "N/A": + return 0.0 + + matches = re.match(TOTAL_STATES_DIFF_REGEX, value) + if not matches: + raise ValueError(f"Failed to parse total states diff field value '{value}'") + + if percentage_diff := matches.group("percentage_diff"): + return float(percentage_diff) + + raise ValueError(f"Invalid {VeristatFields.TOTAL_STATES_DIFF} field value: {value}") + + +def parse_table(csv_file: Iterable[str]) -> VeristatInfo: + reader = csv.DictReader(csv_file) + assert reader.fieldnames == VeristatFields.headers() + + new_failures = False + changes = False + table = [] + + for record in reader: + add = False + + verdict_old, verdict_new = ( + record[VeristatFields.VERDICT_OLD], + record[VeristatFields.VERDICT_NEW], + ) + + # Ignore results from completely new and removed programs + if "N/A" in [verdict_new, verdict_old]: + continue + + if record[VeristatFields.VERDICT_DIFF] == "MISMATCH": + changes = True + add = True + verdict = f"{verdict_old} -> {verdict_new}" + if verdict_new == "failure": + new_failures = True + verdict += f" {NEW_FAILURE_SUFFIX}" + else: + verdict = record[VeristatFields.VERDICT_NEW] + + diff = get_state_diff(record[VeristatFields.TOTAL_STATES_DIFF]) + if abs(diff) > TRESHOLD_PCT: + changes = True + add = True + + if not add: + continue + + table.append( + [ + record[VeristatFields.FILE_NAME], + record[VeristatFields.PROG_NAME], + verdict, + f"{diff:+.2f} %", + ] + ) + + return VeristatInfo(table=table, changes=changes, new_failures=new_failures) + + +def github_markup_decorate(input_str: str) -> str: + for text, markup in GITHUB_MARKUP_REPLACEMENTS.items(): + input_str = input_str.replace(text, markup) + return input_str + + +def format_table(headers: List[str], rows: List[List[str]]) -> str: + column_width = [ + max(len(row[column_idx]) for row in [headers] + rows) + for column_idx in range(len(headers)) + ] + + # Row template string in the following format: + # "{0:8}|{1:10}|{2:15}|{3:7}|{4:10}" + row_template = "|".join( + f"{{{idx}:{width}}}" for idx, width in enumerate(column_width) + ) + row_template_nl = f"|{row_template}|\n" + + with io.StringIO() as out: + out.write(row_template_nl.format(*headers)) + + separator_row = ["-" * width for width in column_width] + out.write(row_template_nl.format(*separator_row)) + + for row in rows: + row_str = row_template_nl.format(*row) + out.write(row_str) + + return out.getvalue() + + +def main(compare_csv_filename: os.PathLike, output_filename: os.PathLike) -> None: + with open(compare_csv_filename, newline="", encoding="utf-8") as csv_file: + veristat_results = parse_table(csv_file) + + sys.stdout.write(veristat_results.get_results_summary()) + + with open(output_filename, encoding="utf-8", mode="a") as file: + file.write(veristat_results.get_results_summary(markup=True)) + + if veristat_results.new_failures: + return 1 + + return 0 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Print veristat comparison output as markdown step summary" + ) + parser.add_argument("filename") + args = parser.parse_args() + summary_filename = os.getenv("GITHUB_STEP_SUMMARY") + if not summary_filename: + logging.error("GITHUB_STEP_SUMMARY environment variable is not set") + sys.exit(1) + sys.exit(main(args.filename, summary_filename)) diff --git a/.github/workflows/gcc-bpf.yml b/.github/workflows/gcc-bpf.yml new file mode 100644 index 0000000000000..8356a0af836f1 --- /dev/null +++ b/.github/workflows/gcc-bpf.yml @@ -0,0 +1,100 @@ +name: Testing GCC BPF compiler + +on: + workflow_call: + inputs: + runs_on: + required: true + type: string + arch: + required: true + type: string + llvm-version: + required: true + type: string + toolchain: + required: true + type: string + toolchain_full: + required: true + type: string + download_sources: + required: true + type: boolean + +jobs: + test: + name: GCC BPF + runs-on: >- + ${{ + contains(fromJSON(inputs.runs_on), 'codebuild') + && format('codebuild-bpf-ci-{0}-{1}', github.run_id, github.run_attempt) + || fromJSON(inputs.runs_on) + }} + env: + ARCH: ${{ inputs.arch }} + BPF_NEXT_BASE_BRANCH: 'master' + GCC_BPF_INSTALL_DIR: ${{ github.workspace }}/gcc-bpf + GCC_BPF_RELEASE_REPO: 'theihor/gcc-bpf' + KBUILD_OUTPUT: ${{ github.workspace }}/src/kbuild-output + REPO_ROOT: ${{ github.workspace }}/src + + steps: + + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + + - if: ${{ inputs.download_sources }} + name: Download bpf-next tree + uses: libbpf/ci/get-linux-source@v3 + with: + dest: ${{ env.REPO_ROOT }} + rev: ${{ env.BPF_NEXT_BASE_BRANCH }} + + - if: ${{ ! inputs.download_sources }} + name: Checkout ${{ github.repository }} to ./src + uses: actions/checkout@v4 + with: + path: 'src' + + - uses: libbpf/ci/patch-kernel@v3 + with: + patches-root: '${{ github.workspace }}/ci/diffs' + repo-root: ${{ env.REPO_ROOT }} + + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }} + path: ${{ env.REPO_ROOT }} + + - name: Untar artifacts + working-directory: ${{ env.REPO_ROOT }} + run: zstd -d -T0 vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}.tar.zst --stdout | tar -xf - + + - name: Setup build environment + uses: libbpf/ci/setup-build-env@v3 + with: + arch: ${{ inputs.arch }} + llvm-version: ${{ inputs.llvm-version }} + + - name: Download GCC BPF compiler + shell: bash + env: + GH_TOKEN: ${{ github.token }} + run: .github/scripts/download-gcc-bpf.sh ${{ env.GCC_BPF_RELEASE_REPO }} ${{ env.GCC_BPF_INSTALL_DIR }} + + - name: Build selftests/bpf/test_progs-bpf_gcc + uses: libbpf/ci/build-selftests@v3 + env: + BPF_GCC: ${{ env.GCC_BPF_INSTALL_DIR }} + MAX_MAKE_JOBS: 32 + SELFTESTS_BPF_TARGETS: 'test_progs-bpf_gcc' + with: + arch: ${{ inputs.arch }} + kernel-root: ${{ env.REPO_ROOT }} + llvm-version: ${{ inputs.llvm-version }} + toolchain: ${{ inputs.toolchain }} + diff --git a/.github/workflows/kernel-build-test.yml b/.github/workflows/kernel-build-test.yml new file mode 100644 index 0000000000000..541ef910b1d28 --- /dev/null +++ b/.github/workflows/kernel-build-test.yml @@ -0,0 +1,147 @@ +name: Reusable Build/Test/Veristat workflow + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: The toolchain and for llvm, its version, e.g gcc, llvm-15 + toolchain: + required: true + type: string + description: The toolchain, e.g gcc, llvm + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + build_runs_on: + required: true + type: string + description: The runners to run the builds on. This is a json string representing an array of labels. + llvm-version: + required: true + type: string + description: The version of LLVM used to build selftest.... for llvm toolchain, this should match the one from toolchain_full, for gcc it is an arbritrary version we decide to build selftests against. + kernel: + required: true + type: string + description: The kernel to run the test against. For KPD this is always LATEST, which runs against a newly built kernel. + tests: + required: true + type: string + description: A serialized json array with the tests to be running, it must follow the json-matrix format, https://www.jitsejan.com/use-github-actions-with-json-file-as-matrix + run_veristat: + required: true + type: boolean + description: Whether or not to run the veristat job. + run_tests: + required: true + type: boolean + description: Whether or not to run the test job. + download_sources: + required: true + type: boolean + description: Whether to download the linux sources into the working directory. + default: false + build_release: + required: true + type: boolean + description: Build selftests with -O2 optimization in addition to non-optimized build. + default: false + secrets: + AWS_ROLE_ARN: + required: true + +jobs: + + # Build kernel and selftest + build: + uses: ./.github/workflows/kernel-build.yml + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + toolchain: ${{ inputs.toolchain }} + runs_on: ${{ inputs.build_runs_on }} + llvm-version: ${{ inputs.llvm-version }} + kernel: ${{ inputs.kernel }} + download_sources: ${{ inputs.download_sources }} + + build-release: + if: ${{ inputs.build_release }} + uses: ./.github/workflows/kernel-build.yml + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + toolchain: ${{ inputs.toolchain }} + runs_on: ${{ inputs.build_runs_on }} + llvm-version: ${{ inputs.llvm-version }} + kernel: ${{ inputs.kernel }} + download_sources: ${{ inputs.download_sources }} + release: true + + test: + if: ${{ inputs.run_tests }} + uses: ./.github/workflows/kernel-test.yml + # Setting name to test here to avoid lengthy autogenerated names due to matrix + # e.g build-and-test x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc + name: "test" + needs: [build] + strategy: + fail-fast: false + matrix: ${{ fromJSON(inputs.tests) }} + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + runs_on: ${{ inputs.runs_on }} + kernel: ${{ inputs.kernel }} + test: ${{ matrix.test }} + continue_on_error: ${{ toJSON(matrix.continue_on_error) }} + timeout_minutes: ${{ matrix.timeout_minutes }} + + veristat-kernel: + if: ${{ inputs.run_veristat }} + uses: ./.github/workflows/veristat-kernel.yml + needs: [build] + permissions: + id-token: write + contents: read + with: + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + runs_on: ${{ inputs.runs_on }} + + veristat-meta: + # Check for vars.AWS_REGION is necessary to skip this job in case of a PR from a fork. + if: ${{ inputs.run_veristat && github.repository_owner == 'kernel-patches' && vars.AWS_REGION }} + uses: ./.github/workflows/veristat-meta.yml + needs: [build] + permissions: + id-token: write + contents: read + with: + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + aws_region: ${{ vars.AWS_REGION }} + runs_on: ${{ inputs.runs_on }} + secrets: + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} + + gcc-bpf: + name: 'GCC BPF' + if: ${{ inputs.arch == 'x86_64' }} + uses: ./.github/workflows/gcc-bpf.yml + needs: [build] + with: + # GCC BPF does not need /dev/kvm, so use the "build" runners + runs_on: ${{ inputs.build_runs_on }} + arch: ${{ inputs.arch }} + llvm-version: ${{ inputs.llvm-version }} + toolchain: ${{ inputs.toolchain }} + toolchain_full: ${{ inputs.toolchain_full }} + download_sources: ${{ inputs.download_sources }} + diff --git a/.github/workflows/kernel-build.yml b/.github/workflows/kernel-build.yml new file mode 100644 index 0000000000000..d2866a6c4fb44 --- /dev/null +++ b/.github/workflows/kernel-build.yml @@ -0,0 +1,191 @@ + +name: Reusable build workflow + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: The toolchain and for llvm, its version, e.g gcc, llvm-15 + toolchain: + required: true + type: string + description: The toolchain, e.g gcc, llvm + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + llvm-version: + required: true + type: string + description: The version of LLVM used to build selftest.... for llvm toolchain, this should match the one from toolchain_full, for gcc it is an arbritrary version we decide to build selftests against. + kernel: + required: true + type: string + description: The kernel to run the test against. For KPD this is always LATEST, which runs against a newly built kernel. + download_sources: + required: true + type: boolean + description: Whether to download the linux sources into the working directory. + default: false + release: + required: false + type: boolean + description: Build selftest with -O2 optimization + default: false + +jobs: + build: + name: build for ${{ inputs.arch }} with ${{ inputs.toolchain_full }}${{ inputs.release && '-O2' || '' }} + # To run on CodeBuild, runs-on value must correspond to the AWS + # CodeBuild project associated with the kernel-patches webhook + # However matrix.py passes just a 'codebuild' string + runs-on: >- + ${{ + contains(fromJSON(inputs.runs_on), 'codebuild') + && format('codebuild-bpf-ci-{0}-{1}', github.run_id, github.run_attempt) + || fromJSON(inputs.runs_on) + }} + env: + ARTIFACTS_ARCHIVE: "vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}.tar.zst" + BPF_NEXT_BASE_BRANCH: 'master' + BPF_NEXT_FETCH_DEPTH: 64 # A bit of history is needed to facilitate incremental builds + CROSS_COMPILE: ${{ inputs.arch != 'x86_64' && 'true' || '' }} + # BUILD_SCHED_EXT_SELFTESTS: ${{ inputs.arch == 'x86_64' || inputs.arch == 'aarch64' && 'true' || '' }} + KBUILD_OUTPUT: ${{ github.workspace }}/kbuild-output + KERNEL: ${{ inputs.kernel }} + KERNEL_ROOT: ${{ github.workspace }} + REPO_PATH: "" + REPO_ROOT: ${{ github.workspace }} + RUNNER_TYPE: ${{ contains(fromJSON(inputs.runs_on), 'codebuild') && 'codebuild' || 'default' }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: ${{ inputs.download_sources && 1 || env.BPF_NEXT_FETCH_DEPTH }} + + - if: ${{ env.RUNNER_TYPE == 'codebuild' }} + shell: bash + run: .github/scripts/tmpfsify-workspace.sh + + - if: ${{ inputs.download_sources }} + name: Download bpf-next tree + env: + FETCH_DEPTH: ${{ env.BPF_NEXT_FETCH_DEPTH }} + uses: libbpf/ci/get-linux-source@v3 + with: + dest: '.kernel' + rev: ${{ env.BPF_NEXT_BASE_BRANCH }} + - uses: libbpf/ci/prepare-incremental-build@v3 + with: + repo-root: ${{ inputs.download_sources && '.kernel' || env.REPO_ROOT }} + base-branch: >- + ${{ inputs.download_sources && env.BPF_NEXT_BASE_BRANCH + || github.event_name == 'pull_request' && github.base_ref + || github.ref_name + }} + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + - if: ${{ inputs.download_sources }} + name: Move linux source in place + shell: bash + run: | + cd .kernel + rm -rf .git + mv -t .. $(ls -A) + cd .. + rmdir .kernel + - uses: libbpf/ci/patch-kernel@v3 + with: + patches-root: '${{ github.workspace }}/ci/diffs' + repo-root: ${{ env.REPO_ROOT }} + + - name: Setup build environment + uses: libbpf/ci/setup-build-env@v3 + with: + arch: ${{ inputs.arch }} + llvm-version: ${{ inputs.llvm-version }} + pahole: master + + # We have to setup qemu+binfmt in order to enable cross-compation of selftests. + # During selftests build, freshly built bpftool is executed. + # On self-hosted bare-metal hosts binfmt is pre-configured. + - if: ${{ env.RUNNER_TYPE == 'codebuild' && env.CROSS_COMPILE }} + name: Set up docker + uses: docker/setup-docker-action@v4 + - if: ${{ env.RUNNER_TYPE == 'codebuild' && env.CROSS_COMPILE }} + name: Setup binfmt and qemu + uses: docker/setup-qemu-action@v3 + with: + image: tonistiigi/binfmt:qemu-v9.2.0 + + - name: Build kernel image + uses: libbpf/ci/build-linux@v3 + with: + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + max-make-jobs: 32 + llvm-version: ${{ inputs.llvm-version }} + + - name: Build selftests/bpf + uses: libbpf/ci/build-selftests@v3 + env: + MAX_MAKE_JOBS: 32 + RELEASE: ${{ inputs.release && '1' || '' }} + with: + arch: ${{ inputs.arch }} + kernel-root: ${{ env.KERNEL_ROOT }} + llvm-version: ${{ inputs.llvm-version }} + toolchain: ${{ inputs.toolchain }} + + - if: ${{ env.BUILD_SCHED_EXT_SELFTESTS }} + name: Build selftests/sched_ext + uses: libbpf/ci/build-scx-selftests@v3 + with: + kbuild-output: ${{ env.KBUILD_OUTPUT }} + repo-root: ${{ env.REPO_ROOT }} + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + llvm-version: ${{ inputs.llvm-version }} + max-make-jobs: 32 + + - if: ${{ github.event_name != 'push' }} + name: Build samples + uses: libbpf/ci/build-samples@v3 + with: + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + max-make-jobs: 32 + llvm-version: ${{ inputs.llvm-version }} + - name: Tar artifacts + id: tar-artifacts + uses: libbpf/ci/tar-artifacts@v3 + env: + ARCHIVE_BPF_SELFTESTS: 'true' + ARCHIVE_MAKE_HELPERS: 'true' + ARCHIVE_SCHED_EXT_SELFTESTS: ${{ env.BUILD_SCHED_EXT_SELFTESTS }} + with: + arch: ${{ inputs.arch }} + archive: ${{ env.ARTIFACTS_ARCHIVE }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + repo-root: ${{ env.REPO_ROOT }} + - if: ${{ github.event_name != 'push' }} + name: Remove KBUILD_OUTPUT content + shell: bash + run: | + # Remove $KBUILD_OUTPUT to prevent cache creation for pull requests. + # Only on pushed changes are build artifacts actually cached, because + # of github.com/actions/cache's cache isolation logic. + rm -rf "${KBUILD_OUTPUT}" + - uses: actions/upload-artifact@v4 + with: + name: vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}${{ inputs.release && '-release' || '' }} + if-no-files-found: error + path: ${{ env.ARTIFACTS_ARCHIVE }} diff --git a/.github/workflows/kernel-test.yml b/.github/workflows/kernel-test.yml new file mode 100644 index 0000000000000..2885f2759de4a --- /dev/null +++ b/.github/workflows/kernel-test.yml @@ -0,0 +1,96 @@ +name: Reusable test workflow + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: The toolchain and for llvm, its version, e.g gcc, llvm-15 + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + kernel: + required: true + type: string + description: The kernel to run the test against. For KPD this is always LATEST, which runs against a newly built kernel. + test: + required: true + type: string + description: The test to run in the vm, e.g test_progs, test_maps, test_progs_no_alu32... + continue_on_error: + required: true + type: string + description: Whether to continue on error. This is typically set to true for parallel tests which are currently known to fail, but we don't want to fail the whole CI because of that. + timeout_minutes: + required: true + type: number + description: In case a test runs for too long, after how many seconds shall we timeout and error. + +jobs: + test: + name: ${{ inputs.test }} on ${{ inputs.arch }} with ${{ inputs.toolchain_full }} + runs-on: ${{ fromJSON(inputs.runs_on) }} + timeout-minutes: 100 + env: + ARCH: ${{ inputs.arch }} + KERNEL: ${{ inputs.kernel }} + REPO_ROOT: ${{ github.workspace }} + REPO_PATH: "" + # https://github.com/actions/runner/issues/1483#issuecomment-1031671517 + # booleans are weird in GH. + CONTINUE_ON_ERROR: ${{ inputs.continue_on_error }} + DEPLOYMENT: ${{ github.repository == 'kernel-patches/bpf' && 'prod' || 'rc' }} + ALLOWLIST_FILE: /tmp/allowlist + DENYLIST_FILE: /tmp/denylist + steps: + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }} + path: . + + - name: Untar artifacts + # zstd is installed by default in the runner images. + run: zstd -d -T0 vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}.tar.zst --stdout | tar -xf - + + - name: Run selftests + uses: libbpf/ci/run-vmtest@v3 + # https://github.com/actions/runner/issues/1483#issuecomment-1031671517 + # booleans are weird in GH. + continue-on-error: ${{ fromJSON(env.CONTINUE_ON_ERROR) }} + timeout-minutes: ${{ inputs.timeout_minutes }} + env: + ARCH: ${{ inputs.arch }} + DEPLOYMENT: ${{ env.DEPLOYMENT }} + KERNEL_TEST: ${{ inputs.test }} + SELFTESTS_BPF: ${{ github.workspace }}/selftests/bpf + VMTEST_CONFIGS: ${{ github.workspace }}/ci/vmtest/configs + TEST_PROGS_TRAFFIC_MONITOR: ${{ inputs.arch == 'x86_64' && 'true' || '' }} + TEST_PROGS_WATCHDOG_TIMEOUT: 600 + with: + arch: ${{ inputs.arch }} + vmlinuz: '${{ github.workspace }}/vmlinuz' + kernel-root: ${{ env.REPO_ROOT }} + max-cpu: 8 + kernel-test: ${{ inputs.test }} + # Here we must use kbuild-output local to the repo, because + # it was extracted from the artifacts. + kbuild-output: ${{ env.REPO_ROOT }}/kbuild-output + + - if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: tmon-logs-${{ inputs.arch }}-${{ inputs.toolchain_full }}-${{ inputs.test }} + if-no-files-found: ignore + path: /tmp/tmon_pcap/* diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000000000..1c910fd297309 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,65 @@ +name: "lint" + +on: + pull_request: + push: + branches: + - master + +jobs: + shellcheck: + # This workflow gets injected into other Linux repositories, but we don't + # want it to run there. + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: ShellCheck + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Run ShellCheck + uses: ludeeus/action-shellcheck@master + env: + SHELLCHECK_OPTS: --severity=warning --exclude=SC1091 + + # Ensure some consistency in the formatting. + lint: + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: Lint + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Run black + uses: psf/black@stable + with: + src: ./.github/scripts + + validate_matrix: + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: Validate matrix.py + runs-on: ubuntu-latest + env: + GITHUB_REPOSITORY_OWNER: ${{ matrix.owner }} + GITHUB_REPOSITORY: ${{ matrix.repository }} + GITHUB_OUTPUT: /dev/stdout + strategy: + matrix: + owner: ['kernel-patches', 'foo'] + repository: ['bpf', 'vmtest', 'bar'] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: run script + run: | + python3 .github/scripts/matrix.py + + unittests: + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: Unittests + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Run unittests + run: python3 -m unittest scripts/tests/*.py + working-directory: .github diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000000000..16421f53e5e26 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,72 @@ +name: bpf-ci + +on: + pull_request: + push: + branches: + - bpf_base + - bpf-next_base + - bpf-net_base + - for-next_base + +concurrency: + group: ci-test-${{ github.ref_name }} + cancel-in-progress: true + +jobs: + set-matrix: + # FIXME: set-matrix is lightweight, run it on any self-hosted machines for kernel-patches org + # so we do not wait for GH hosted runners when there potentially all are busy because of bpf-rc + # repo for instance. + # This could be somehow fixed long term by making this action/workflow re-usable and letting the called + # specify what to run on. + runs-on: ${{ github.repository_owner == 'kernel-patches' && 'x86_64' || 'ubuntu-latest' }} + permissions: read-all + outputs: + build-matrix: ${{ steps.set-matrix-impl.outputs.build_matrix }} + steps: + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + - name: Install script dependencies + shell: bash + run: | + sudo apt-get -y update + sudo apt-get -y install python3-requests + - id: set-matrix-impl + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT_READ_RUNNERS }} + run: | + python3 .github/scripts/matrix.py + + build-and-test: + # Setting name to arch-compiler here to avoid lengthy autogenerated names due to matrix + # e.g build-and-test x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc + name: "${{ matrix.arch }}-${{ matrix.toolchain.fullname }}" + uses: ./.github/workflows/kernel-build-test.yml + needs: [set-matrix] + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.set-matrix.outputs.build-matrix) }} + with: + arch: ${{ matrix.arch }} + toolchain_full: ${{ matrix.toolchain.fullname }} + toolchain: ${{ matrix.toolchain.name }} + runs_on: ${{ toJSON(matrix.runs_on) }} + build_runs_on: ${{ toJSON(matrix.build_runs_on) }} + llvm-version: ${{ matrix.toolchain.version }} + kernel: ${{ matrix.kernel }} + tests: ${{ toJSON(matrix.tests) }} + run_veristat: ${{ matrix.run_veristat }} + # We only run tests on pull requests. + run_tests: ${{ github.event_name != 'push' }} + # Download sources + download_sources: ${{ github.repository == 'kernel-patches/vmtest' }} + build_release: ${{ matrix.build_release }} + secrets: + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} diff --git a/.github/workflows/veristat-kernel.yml b/.github/workflows/veristat-kernel.yml new file mode 100644 index 0000000000000..0557daebf5bab --- /dev/null +++ b/.github/workflows/veristat-kernel.yml @@ -0,0 +1,65 @@ +name: veristat_kernel + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain: + required: true + type: string + description: The toolchain, e.g gcc, llvm + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + +jobs: + veristat: + name: ${{ inputs.arch }}-${{ inputs.toolchain }} veristat_kernel + runs-on: ${{ fromJSON(inputs.runs_on) }} + timeout-minutes: 100 + permissions: + id-token: write + contents: read + env: + KERNEL: LATEST + REPO_ROOT: ${{ github.workspace }} + REPO_PATH: "" + KBUILD_OUTPUT: kbuild-output/ + ARCH_AND_TOOL: ${{ inputs.arch }}-${{ inputs.toolchain }} + + steps: + + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ env.ARCH_AND_TOOL }} + path: . + + - name: Untar artifacts + run: zstd -d -T0 vmlinux-${{ env.ARCH_AND_TOOL }}.tar.zst --stdout | tar -xf - + + - name: Run veristat + uses: libbpf/ci/run-vmtest@v3 + with: + arch: x86_64 + vmlinuz: '${{ github.workspace }}/vmlinuz' + kernel-root: '.' + max-cpu: 8 + kernel-test: 'run_veristat_kernel' + output-dir: '${{ github.workspace }}' + + - name: Compare and save veristat.kernel.csv + uses: ./.github/actions/veristat_baseline_compare + with: + veristat_output: veristat-kernel + baseline_name: ${{ env.ARCH_AND_TOOL}}-baseline-veristat-kernel + diff --git a/.github/workflows/veristat-meta.yml b/.github/workflows/veristat-meta.yml new file mode 100644 index 0000000000000..00700a8205e62 --- /dev/null +++ b/.github/workflows/veristat-meta.yml @@ -0,0 +1,87 @@ +name: veristat_meta + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain: + required: true + type: string + description: The toolchain, e.g gcc, llvm + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + aws_region: + required: true + type: string + description: The AWS region where we pull bpf objects to run against veristat. + secrets: + AWS_ROLE_ARN: + required: true + description: The AWS role used by GH to pull BPF objects from AWS. + +jobs: + veristat: + name: ${{ inputs.arch }}-${{ inputs.toolchain }} veristat_meta + runs-on: ${{ fromJSON(inputs.runs_on) }} + timeout-minutes: 100 + permissions: + id-token: write + contents: read + env: + KERNEL: LATEST + REPO_ROOT: ${{ github.workspace }} + REPO_PATH: "" + KBUILD_OUTPUT: kbuild-output/ + ARCH_AND_TOOL: ${{ inputs.arch }}-${{ inputs.toolchain }} + + steps: + + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ env.ARCH_AND_TOOL }} + path: . + + - name: Untar artifacts + run: zstd -d -T0 vmlinux-${{ env.ARCH_AND_TOOL }}.tar.zst --stdout | tar -xf - + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v3 + with: + aws-region: ${{ inputs.aws_region }} + role-to-assume: ${{ secrets.AWS_ROLE_ARN }} + role-session-name: github-action-bpf-ci + + - name: Download BPF objects + run: | + mkdir ./bpf_objects + aws s3 sync s3://veristat-bpf-binaries ./bpf_objects + env: + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} + + - name: Run veristat + uses: libbpf/ci/run-vmtest@v3 + with: + arch: x86_64 + vmlinuz: '${{ github.workspace }}/vmlinuz' + kernel-root: '.' + max-cpu: 8 + kernel-test: 'run_veristat_meta' + output-dir: '${{ github.workspace }}' + + - name: Compare and save veristat.meta.csv + uses: ./.github/actions/veristat_baseline_compare + with: + veristat_output: veristat-meta + baseline_name: ${{ env.ARCH_AND_TOOL}}-baseline-veristat-meta + diff --git a/.mailmap b/.mailmap index 4f7cd8e231778..6efaee6537e49 100644 --- a/.mailmap +++ b/.mailmap @@ -322,6 +322,7 @@ Jayachandran C Jayachandran C Jayachandran C +Jean-Michel Hautbois Jean Tourrilhes Jeevan Shriram Jeff Garzik @@ -438,6 +439,8 @@ Linus Lüssing Li Yang Li Yang Lior David +Loic Poulain +Loic Poulain Lorenzo Pieralisi Lorenzo Stoakes Luca Ceresoli diff --git a/CREDITS b/CREDITS index 1b77fba6c27ec..f74d230992d6c 100644 --- a/CREDITS +++ b/CREDITS @@ -2071,6 +2071,10 @@ S: 660 Harvard Ave. #7 S: Santa Clara, CA 95051 S: USA +N: Joonsoo Kim +E: iamjoonsoo.kim@lge.com +D: Slab allocators + N: Kukjin Kim E: kgene@kernel.org D: Samsung S3C, S5P and Exynos ARM architectures diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 3879963f0f01e..11545c9e2e93f 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -77,7 +77,7 @@ Description: What: /sys/block//diskseq Date: February 2021 -Contact: Matteo Croce +Contact: Matteo Croce Description: The /sys/block//diskseq files reports the disk sequence number, which is a monotonically increasing diff --git a/Documentation/ABI/testing/sysfs-kernel-reboot b/Documentation/ABI/testing/sysfs-kernel-reboot index e117aba46be0e..52571fd5ddba5 100644 --- a/Documentation/ABI/testing/sysfs-kernel-reboot +++ b/Documentation/ABI/testing/sysfs-kernel-reboot @@ -1,7 +1,7 @@ What: /sys/kernel/reboot Date: November 2020 KernelVersion: 5.11 -Contact: Matteo Croce +Contact: Matteo Croce Description: Interface to set the kernel reboot behavior, similarly to what can be done via the reboot= cmdline option. (see Documentation/admin-guide/kernel-parameters.txt) @@ -9,25 +9,25 @@ Description: Interface to set the kernel reboot behavior, similarly to What: /sys/kernel/reboot/mode Date: November 2020 KernelVersion: 5.11 -Contact: Matteo Croce +Contact: Matteo Croce Description: Reboot mode. Valid values are: cold warm hard soft gpio What: /sys/kernel/reboot/type Date: November 2020 KernelVersion: 5.11 -Contact: Matteo Croce +Contact: Matteo Croce Description: Reboot type. Valid values are: bios acpi kbd triple efi pci What: /sys/kernel/reboot/cpu Date: November 2020 KernelVersion: 5.11 -Contact: Matteo Croce +Contact: Matteo Croce Description: CPU number to use to reboot. What: /sys/kernel/reboot/force Date: November 2020 KernelVersion: 5.11 -Contact: Matteo Croce +Contact: Matteo Croce Description: Don't wait for any other CPUs on reboot and avoid anything that could hang. diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst index 9bdb30901a930..b8d36134a151e 100644 --- a/Documentation/admin-guide/blockdev/zram.rst +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -369,6 +369,23 @@ they could write a page index into the interface:: echo "page_index=1251" > /sys/block/zramX/writeback +In Linux 6.16 this interface underwent some rework. First, the interface +now supports `key=value` format for all of its parameters (`type=huge_idle`, +etc.) Second, the support for `page_indexes` was introduced, which specify +`LOW-HIGH` range (or ranges) of pages to be written-back. This reduces the +number of syscalls, but more importantly this enables optimal post-processing +target selection strategy. Usage example:: + + echo "type=idle" > /sys/block/zramX/writeback + echo "page_indexes=1-100 page_indexes=200-300" > \ + /sys/block/zramX/writeback + +We also now permit multiple page_index params per call and a mix of +single pages and page ranges:: + + echo page_index=42 page_index=99 page_indexes=100-200 \ + page_indexes=500-700 > /sys/block/zramX/writeback + If there are lots of write IO with flash device, potentially, it has flash wearout problem so that admin needs to design write limitation to guarantee storage health for entire product life. diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 1a16ce68a4d7f..d346f32359452 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1670,6 +1670,12 @@ The following nested keys are defined. numa_hint_faults (npn) Number of NUMA hinting faults. + numa_task_migrated (npn) + Number of task migration by NUMA balancing. + + numa_task_swapped (npn) + Number of task swap by NUMA balancing. + pgdemote_kswapd Number of pages demoted by kswapd. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d9fd26b95b340..54cb1d46e41f7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2725,6 +2725,31 @@ kgdbwait [KGDB,EARLY] Stop kernel execution and enter the kernel debugger at the earliest opportunity. + kho= [KEXEC,EARLY] + Format: { "0" | "1" | "off" | "on" | "y" | "n" } + Enables or disables Kexec HandOver. + "0" | "off" | "n" - kexec handover is disabled + "1" | "on" | "y" - kexec handover is enabled + + kho_scratch= [KEXEC,EARLY] + Format: ll[KMG],mm[KMG],nn[KMG] | nn% + Defines the size of the KHO scratch region. The KHO + scratch regions are physically contiguous memory + ranges that can only be used for non-kernel + allocations. That way, even when memory is heavily + fragmented with handed over memory, the kexeced + kernel will always have enough contiguous ranges to + bootstrap itself. + + It is possible to specify the exact amount of + memory in the form of "ll[KMG],mm[KMG],nn[KMG]" + where the first parameter defines the size of a low + memory scratch area, the second parameter defines + the size of a global scratch area and the third + parameter defines the size of additional per-node + scratch areas. The form "nn%" defines scale factor + (in percents) of memory that was used during boot. + kmac= [MIPS] Korina ethernet MAC address. Configure the RouterBoard 532 series on-chip Ethernet adapter MAC address. diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index 8b35795b664be..2d2f6c222308f 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -42,3 +42,4 @@ the Linux memory management. transhuge userfaultfd zswap + kho diff --git a/Documentation/admin-guide/mm/kho.rst b/Documentation/admin-guide/mm/kho.rst new file mode 100644 index 0000000000000..c64aa7aadb300 --- /dev/null +++ b/Documentation/admin-guide/mm/kho.rst @@ -0,0 +1,120 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + +==================== +Kexec Handover Usage +==================== + +Kexec HandOver (KHO) is a mechanism that allows Linux to preserve memory +regions, which could contain serialized system states, across kexec. + +This document expects that you are familiar with the base KHO +:ref:`concepts `. If you have not read +them yet, please do so now. + +Prerequisites +============= + +KHO is available when the ``CONFIG_KEXEC_HANDOVER`` config option is set to y +at compile time. Every KHO producer may have its own config option that you +need to enable if you would like to preserve their respective state across +kexec. + +To use KHO, please boot the kernel with the ``kho=on`` command line +parameter. You may use ``kho_scratch`` parameter to define size of the +scratch regions. For example ``kho_scratch=16M,512M,256M`` will reserve a +16 MiB low memory scratch area, a 512 MiB global scratch region, and 256 MiB +per NUMA node scratch regions on boot. + +Perform a KHO kexec +=================== + +First, before you perform a KHO kexec, you need to move the system into +the :ref:`KHO finalization phase ` :: + + $ echo 1 > /sys/kernel/debug/kho/out/finalize + +After this command, the KHO FDT is available in +``/sys/kernel/debug/kho/out/fdt``. Other subsystems may also register +their own preserved sub FDTs under +``/sys/kernel/debug/kho/out/sub_fdts/``. + +Next, load the target payload and kexec into it. It is important that you +use the ``-s`` parameter to use the in-kernel kexec file loader, as user +space kexec tooling currently has no support for KHO with the user space +based file loader :: + + # kexec -l Image --initrd=initrd -s + # kexec -e + +The new kernel will boot up and contain some of the previous kernel's state. + +For example, if you used ``reserve_mem`` command line parameter to create +an early memory reservation, the new kernel will have that memory at the +same physical address as the old kernel. + +Abort a KHO exec +================ + +You can move the system out of KHO finalization phase again by calling :: + + $ echo 0 > /sys/kernel/debug/kho/out/active + +After this command, the KHO FDT is no longer available in +``/sys/kernel/debug/kho/out/fdt``. + +debugfs Interfaces +================== + +Currently KHO creates the following debugfs interfaces. Notice that these +interfaces may change in the future. They will be moved to sysfs once KHO is +stabilized. + +``/sys/kernel/debug/kho/out/finalize`` + Kexec HandOver (KHO) allows Linux to transition the state of + compatible drivers into the next kexec'ed kernel. To do so, + device drivers will instruct KHO to preserve memory regions, + which could contain serialized kernel state. + While the state is serialized, they are unable to perform + any modifications to state that was serialized, such as + handed over memory allocations. + + When this file contains "1", the system is in the transition + state. When contains "0", it is not. To switch between the + two states, echo the respective number into this file. + +``/sys/kernel/debug/kho/out/fdt`` + When KHO state tree is finalized, the kernel exposes the + flattened device tree blob that carries its current KHO + state in this file. Kexec user space tooling can use this + as input file for the KHO payload image. + +``/sys/kernel/debug/kho/out/scratch_len`` + To support continuous KHO kexecs, we need to reserve + physically contiguous memory regions that will always stay + available for future kexec allocations. This file describes + the length of these memory regions. Kexec user space tooling + can use this to determine where it should place its payload + images. + +``/sys/kernel/debug/kho/out/scratch_phys`` + To support continuous KHO kexecs, we need to reserve + physically contiguous memory regions that will always stay + available for future kexec allocations. This file describes + the physical location of these memory regions. Kexec user space + tooling can use this to determine where it should place its + payload images. + +``/sys/kernel/debug/kho/out/sub_fdts/`` + In the KHO finalization phase, KHO producers register their own + FDT blob under this directory. + +``/sys/kernel/debug/kho/in/fdt`` + When the kernel was booted with Kexec HandOver (KHO), + the state tree that carries metadata about the previous + kernel's state is in this file in the format of flattened + device tree. This file may disappear when all consumers of + it finished to interpret their metadata. + +``/sys/kernel/debug/kho/in/sub_fdts/`` + Similar to ``kho/out/sub_fdts/``, but contains sub FDT blobs + of KHO producers passed from the old kernel. diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index afce291649dd6..e60e9211fd9b2 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -250,6 +250,7 @@ Following flags about pages are currently supported: - ``PAGE_IS_PFNZERO`` - Page has zero PFN - ``PAGE_IS_HUGE`` - Page is PMD-mapped THP or Hugetlb backed - ``PAGE_IS_SOFT_DIRTY`` - Page is soft-dirty +- ``PAGE_IS_GUARD`` - Page is a part of a guard region The ``struct pm_scan_arg`` is used as the argument of the IOCTL. diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 8290177b4f758..b325bfbc2611f 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -131,6 +131,12 @@ to latency spikes in unsuspecting applications. The kernel employs various heuristics to avoid wasting CPU cycles if it detects that proactive compaction is not being effective. +Setting the value above 80 will, in addition to lowering the acceptable level +of fragmentation, make the compaction code more sensitive to increases in +fragmentation, i.e. compaction will trigger more often, but reduce +fragmentation by a smaller amount. +This makes the fragmentation level more stable over time. + Be careful when setting it to extreme values like 100, as that may cause excessive background compaction activity. diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index e9789bd381d80..7a4ca18ca6e2d 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -115,6 +115,7 @@ more memory-management documentation in Documentation/mm/index.rst. pin_user_pages boot-time-mm gfp_mask-from-fs-io + kho/index Interfaces for kernel debugging =============================== diff --git a/Documentation/core-api/kho/bindings/kho.yaml b/Documentation/core-api/kho/bindings/kho.yaml new file mode 100644 index 0000000000000..11e8ab7b219d9 --- /dev/null +++ b/Documentation/core-api/kho/bindings/kho.yaml @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +title: Kexec HandOver (KHO) root tree + +maintainers: + - Mike Rapoport + - Changyuan Lyu + +description: | + System memory preserved by KHO across kexec. + +properties: + compatible: + enum: + - kho-v1 + + preserved-memory-map: + description: | + physical address (u64) of an in-memory structure describing all preserved + folios and memory ranges. + +patternProperties: + "$[0-9a-f_]+^": + $ref: sub-fdt.yaml# + description: physical address of a KHO user's own FDT. + +required: + - compatible + - preserved-memory-map + +additionalProperties: false + +examples: + - | + kho { + compatible = "kho-v1"; + preserved-memory-map = <0xf0be16 0x1000000>; + + memblock { + fdt = <0x80cc16 0x1000000>; + }; + }; diff --git a/Documentation/core-api/kho/bindings/memblock/memblock.yaml b/Documentation/core-api/kho/bindings/memblock/memblock.yaml new file mode 100644 index 0000000000000..d388c28eb91d1 --- /dev/null +++ b/Documentation/core-api/kho/bindings/memblock/memblock.yaml @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +title: Memblock reserved memory + +maintainers: + - Mike Rapoport + +description: | + Memblock can serialize its current memory reservations created with + reserve_mem command line option across kexec through KHO. + The post-KHO kernel can then consume these reservations and they are + guaranteed to have the same physical address. + +properties: + compatible: + enum: + - reserve-mem-v1 + +patternProperties: + "$[0-9a-f_]+^": + $ref: reserve-mem.yaml# + description: reserved memory regions + +required: + - compatible + +additionalProperties: false + +examples: + - | + memblock { + compatible = "memblock-v1"; + n1 { + compatible = "reserve-mem-v1"; + start = <0xc06b 0x4000000>; + size = <0x04 0x00>; + }; + }; diff --git a/Documentation/core-api/kho/bindings/memblock/reserve-mem.yaml b/Documentation/core-api/kho/bindings/memblock/reserve-mem.yaml new file mode 100644 index 0000000000000..10282d3d1bcdc --- /dev/null +++ b/Documentation/core-api/kho/bindings/memblock/reserve-mem.yaml @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +title: Memblock reserved memory regions + +maintainers: + - Mike Rapoport + +description: | + Memblock can serialize its current memory reservations created with + reserve_mem command line option across kexec through KHO. + This object describes each such region. + +properties: + compatible: + enum: + - reserve-mem-v1 + + start: + description: | + physical address (u64) of the reserved memory region. + + size: + description: | + size (u64) of the reserved memory region. + +required: + - compatible + - start + - size + +additionalProperties: false + +examples: + - | + n1 { + compatible = "reserve-mem-v1"; + start = <0xc06b 0x4000000>; + size = <0x04 0x00>; + }; diff --git a/Documentation/core-api/kho/bindings/sub-fdt.yaml b/Documentation/core-api/kho/bindings/sub-fdt.yaml new file mode 100644 index 0000000000000..b9a3d2d248501 --- /dev/null +++ b/Documentation/core-api/kho/bindings/sub-fdt.yaml @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +title: KHO users' FDT address + +maintainers: + - Mike Rapoport + - Changyuan Lyu + +description: | + Physical address of an FDT blob registered by a KHO user. + +properties: + fdt: + description: | + physical address (u64) of an FDT blob. + +required: + - fdt + +additionalProperties: false + +examples: + - | + memblock { + fdt = <0x80cc16 0x1000000>; + }; diff --git a/Documentation/core-api/kho/concepts.rst b/Documentation/core-api/kho/concepts.rst new file mode 100644 index 0000000000000..f1826ac10da75 --- /dev/null +++ b/Documentation/core-api/kho/concepts.rst @@ -0,0 +1,74 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later +.. _concepts: + +======================= +Kexec Handover Concepts +======================= + +Kexec HandOver (KHO) is a mechanism that allows Linux to preserve memory +regions, which could contain serialized system states, across kexec. + +It introduces multiple concepts: + +KHO FDT +======= + +Every KHO kexec carries a KHO specific flattened device tree (FDT) blob +that describes preserved memory regions. These regions contain either +serialized subsystem states, or in-memory data that shall not be touched +across kexec. After KHO, subsystems can retrieve and restore preserved +memory regions from KHO FDT. + +KHO only uses the FDT container format and libfdt library, but does not +adhere to the same property semantics that normal device trees do: Properties +are passed in native endianness and standardized properties like ``regs`` and +``ranges`` do not exist, hence there are no ``#...-cells`` properties. + +KHO is still under development. The FDT schema is unstable and would change +in the future. + +Scratch Regions +=============== + +To boot into kexec, we need to have a physically contiguous memory range that +contains no handed over memory. Kexec then places the target kernel and initrd +into that region. The new kernel exclusively uses this region for memory +allocations before during boot up to the initialization of the page allocator. + +We guarantee that we always have such regions through the scratch regions: On +first boot KHO allocates several physically contiguous memory regions. Since +after kexec these regions will be used by early memory allocations, there is a +scratch region per NUMA node plus a scratch region to satisfy allocations +requests that do not require particular NUMA node assignment. +By default, size of the scratch region is calculated based on amount of memory +allocated during boot. The ``kho_scratch`` kernel command line option may be +used to explicitly define size of the scratch regions. +The scratch regions are declared as CMA when page allocator is initialized so +that their memory can be used during system lifetime. CMA gives us the +guarantee that no handover pages land in that region, because handover pages +must be at a static physical memory location and CMA enforces that only +movable pages can be located inside. + +After KHO kexec, we ignore the ``kho_scratch`` kernel command line option and +instead reuse the exact same region that was originally allocated. This allows +us to recursively execute any amount of KHO kexecs. Because we used this region +for boot memory allocations and as target memory for kexec blobs, some parts +of that memory region may be reserved. These reservations are irrelevant for +the next KHO, because kexec can overwrite even the original kernel. + +.. _finalization_phase: + +KHO finalization phase +====================== + +To enable user space based kexec file loader, the kernel needs to be able to +provide the FDT that describes the current kernel's state before +performing the actual kexec. The process of generating that FDT is +called serialization. When the FDT is generated, some properties +of the system may become immutable because they are already written down +in the FDT. That state is called the KHO finalization phase. + +Public API +========== +.. kernel-doc:: kernel/kexec_handover.c + :export: diff --git a/Documentation/core-api/kho/fdt.rst b/Documentation/core-api/kho/fdt.rst new file mode 100644 index 0000000000000..4a5d53c670d4b --- /dev/null +++ b/Documentation/core-api/kho/fdt.rst @@ -0,0 +1,80 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + +======= +KHO FDT +======= + +KHO uses the flattened device tree (FDT) container format and libfdt +library to create and parse the data that is passed between the +kernels. The properties in KHO FDT are stored in native format. +It includes the physical address of an in-memory structure describing +all preserved memory regions, as well as physical addresses of KHO users' +own FDTs. Interpreting those sub FDTs is the responsibility of KHO users. + +KHO nodes and properties +======================== + +Property ``preserved-memory-map`` +--------------------------------- + +KHO saves a special property named ``preserved-memory-map`` under the root node. +This node contains the physical address of an in-memory structure for KHO to +preserve memory regions across kexec. + +Property ``compatible`` +----------------------- + +The ``compatible`` property determines compatibility between the kernel +that created the KHO FDT and the kernel that attempts to load it. +If the kernel that loads the KHO FDT is not compatible with it, the entire +KHO process will be bypassed. + +Property ``fdt`` +---------------- + +Generally, A KHO user serialize its state into its own FDT and instructs +KHO to preserve the underlying memory, such that after kexec, the new kernel +can recover its state from the preserved FDT. + +A KHO user thus can create a node in KHO root tree and save the physical address +of its own FDT in that node's property ``fdt`` . + +Examples +======== + +The following example demonstrates KHO FDT that preserves two memory +regions created with ``reserve_mem`` kernel command line parameter:: + + /dts-v1/; + + / { + compatible = "kho-v1"; + + preserved-memory-map = <0x40be16 0x1000000>; + + memblock { + fdt = <0x1517 0x1000000>; + }; + }; + +where the ``memblock`` node contains an FDT that is requested by the +subsystem memblock for preservation. The FDT contains the following +serialized data:: + + /dts-v1/; + + / { + compatible = "memblock-v1"; + + n1 { + compatible = "reserve-mem-v1"; + start = <0xc06b 0x4000000>; + size = <0x04 0x00>; + }; + + n2 { + compatible = "reserve-mem-v1"; + start = <0xc067 0x4000000>; + size = <0x04 0x00>; + }; + }; diff --git a/Documentation/core-api/kho/index.rst b/Documentation/core-api/kho/index.rst new file mode 100644 index 0000000000000..0c63b0c5c1436 --- /dev/null +++ b/Documentation/core-api/kho/index.rst @@ -0,0 +1,13 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + +======================== +Kexec Handover Subsystem +======================== + +.. toctree:: + :maxdepth: 1 + + concepts + fdt + +.. only:: subproject and html diff --git a/Documentation/userspace-api/mseal.rst b/Documentation/userspace-api/mseal.rst index 1dabfc29be0d1..7195a7f911075 100644 --- a/Documentation/userspace-api/mseal.rst +++ b/Documentation/userspace-api/mseal.rst @@ -27,7 +27,7 @@ SYSCALL ======= mseal syscall signature ----------------------- - ``int mseal(void \* addr, size_t len, unsigned long flags)`` + ``int mseal(void *addr, size_t len, unsigned long flags)`` **addr**/**len**: virtual memory address range. The address range set by **addr**/**len** must meet: diff --git a/MAINTAINERS b/MAINTAINERS index c59316109e3f8..eb294503d54b1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10956,6 +10956,7 @@ F: include/linux/platform_data/huawei-gaokun-ec.h HUGETLB SUBSYSTEM M: Muchun Song +R: Oscar Salvador L: linux-mm@kvack.org S: Maintained F: Documentation/ABI/testing/sysfs-kernel-mm-hugepages @@ -12812,6 +12813,7 @@ F: lib/Kconfig.kcsan F: scripts/Makefile.kcsan KDUMP +M: Andrew Morton M: Baoquan He R: Vivek Goyal R: Dave Young @@ -13113,12 +13115,25 @@ F: fs/kernfs/ F: include/linux/kernfs.h KEXEC +M: Andrew Morton +M: Baoquan He L: kexec@lists.infradead.org W: http://kernel.org/pub/linux/utils/kernel/kexec/ F: include/linux/kexec.h F: include/uapi/linux/kexec.h F: kernel/kexec* +KEXEC HANDOVER (KHO) +M: Alexander Graf +M: Mike Rapoport +M: Changyuan Lyu +L: kexec@lists.infradead.org +S: Maintained +F: Documentation/admin-guide/mm/kho.rst +F: Documentation/core-api/kho/* +F: include/linux/kexec_handover.h +F: kernel/kexec_handover.c + KEYS-ENCRYPTED M: Mimi Zohar L: linux-integrity@vger.kernel.org @@ -15416,6 +15431,7 @@ M: Mike Rapoport L: linux-mm@kvack.org S: Maintained F: Documentation/core-api/boot-time-mm.rst +F: Documentation/core-api/kho/bindings/memblock/* F: include/linux/memblock.h F: mm/memblock.c F: mm/mm_init.c @@ -15510,6 +15526,21 @@ F: mm/numa.c F: mm/numa_emulation.c F: mm/numa_memblks.c +MEMORY MANAGEMENT - PAGE ALLOCATOR +M: Andrew Morton +R: Vlastimil Babka +R: Suren Baghdasaryan +R: Michal Hocko +R: Brendan Jackman +R: Johannes Weiner +R: Zi Yan +L: linux-mm@kvack.org +S: Maintained +F: mm/compaction.c +F: mm/page_alloc.c +F: include/linux/gfp.h +F: include/linux/compaction.h + MEMORY MANAGEMENT - SECRETMEM M: Andrew Morton M: Mike Rapoport @@ -15531,16 +15562,31 @@ F: include/uapi/linux/userfaultfd.h F: mm/userfaultfd.c F: tools/testing/selftests/mm/uffd-*.[ch] +MEMORY MANAGEMENT - RUST +M: Alice Ryhl +R: Lorenzo Stoakes +R: Liam R. Howlett +L: linux-mm@kvack.org +L: rust-for-linux@vger.kernel.org +S: Maintained +W: http://www.linux-mm.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: rust/helpers/mm.c +F: rust/kernel/mm.rs +F: rust/kernel/mm/ + MEMORY MAPPING M: Andrew Morton M: Liam R. Howlett M: Lorenzo Stoakes R: Vlastimil Babka R: Jann Horn +R: Pedro Falcato L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: include/trace/events/mmap.h F: mm/mlock.c F: mm/mmap.c F: mm/mprotect.c @@ -15549,8 +15595,39 @@ F: mm/mseal.c F: mm/vma.c F: mm/vma.h F: mm/vma_internal.h +F: tools/testing/selftests/mm/merge.c F: tools/testing/vma/ +MEMORY MAPPING - LOCKING +M: Andrew Morton +M: Suren Baghdasaryan +M: Liam R. Howlett +M: Lorenzo Stoakes +R: Vlastimil Babka +R: Shakeel Butt +L: linux-mm@kvack.org +S: Maintained +W: http://www.linux-mm.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: Documentation/mm/process_addrs.rst +F: include/linux/mmap_lock.h +F: include/trace/events/mmap_lock.h +F: mm/mmap_lock.c + +MEMORY MAPPING - MADVISE (MEMORY ADVICE) +M: Andrew Morton +M: Liam R. Howlett +M: Lorenzo Stoakes +M: David Hildenbrand +R: Vlastimil Babka +R: Jann Horn +L: linux-mm@kvack.org +S: Maintained +W: http://www.linux-mm.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: include/uapi/asm-generic/mman-common.h +F: mm/madvise.c + MEMORY TECHNOLOGY DEVICES (MTD) M: Miquel Raynal M: Richard Weinberger @@ -22239,9 +22316,7 @@ F: drivers/nvmem/layouts/sl28vpd.c SLAB ALLOCATOR M: Christoph Lameter -M: Pekka Enberg M: David Rientjes -M: Joonsoo Kim M: Andrew Morton M: Vlastimil Babka R: Roman Gushchin diff --git a/README b/README index fd903645e6de0..e69de29bb2d1d 100644 --- a/README +++ b/README @@ -1,18 +0,0 @@ -Linux kernel -============ - -There are several guides for kernel developers and users. These guides can -be rendered in a number of formats, like HTML and PDF. Please read -Documentation/admin-guide/README.rst first. - -In order to build the documentation, use ``make htmldocs`` or -``make pdfdocs``. The formatted documentation can also be read online at: - - https://www.kernel.org/doc/html/latest/ - -There are various text files in the Documentation/ subdirectory, -several of them using the reStructuredText markup notation. - -Please read the Documentation/process/changes.rst file, as it contains the -requirements for building and running the kernel, and information about -the problems which may result by upgrading your kernel. diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 02e8817a89212..2676017f42f17 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -192,13 +192,6 @@ extern unsigned long __zero_page(void); #define pte_pfn(pte) (pte_val(pte) >> PFN_PTE_SHIFT) #define pte_page(pte) pfn_to_page(pte_pfn(pte)) -#define mk_pte(page, pgprot) \ -({ \ - pte_t pte; \ - \ - pte_val(pte) = (page_to_pfn(page) << 32) | pgprot_val(pgprot); \ - pte; \ -}) extern inline pte_t pfn_pte(unsigned long physpfn, pgprot_t pgprot) { pte_t pte; pte_val(pte) = (PHYS_TWIDDLE(physpfn) << 32) | pgprot_val(pgprot); return pte; } diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h index 8a2441670a8f6..7765dc105d546 100644 --- a/arch/arc/include/asm/hugepage.h +++ b/arch/arc/include/asm/hugepage.h @@ -40,8 +40,6 @@ static inline pmd_t pte_pmd(pte_t pte) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) -#define mk_pmd(page, prot) pte_pmd(mk_pte(page, prot)) - #define pmd_trans_huge(pmd) (pmd_val(pmd) & _PAGE_HW_SZ) #define pfn_pmd(pfn, prot) (__pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))) diff --git a/arch/arc/include/asm/pgtable-levels.h b/arch/arc/include/asm/pgtable-levels.h index 86e1482264630..d1ce4b0f1071b 100644 --- a/arch/arc/include/asm/pgtable-levels.h +++ b/arch/arc/include/asm/pgtable-levels.h @@ -142,7 +142,6 @@ #define pmd_pfn(pmd) ((pmd_val(pmd) & PMD_MASK) >> PAGE_SHIFT) #define pfn_pmd(pfn,prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) -#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) #endif @@ -177,7 +176,6 @@ #define set_pte(ptep, pte) ((*(ptep)) = (pte)) #define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT) #define pfn_pte(pfn, prot) __pte(__pfn_to_phys(pfn) | pgprot_val(prot)) -#define mk_pte(page, prot) pfn_pte(page_to_pfn(page), prot) #ifdef CONFIG_ISA_ARCV2 #define pmd_leaf(x) (pmd_val(x) & _PAGE_HW_SZ) diff --git a/arch/arc/include/asm/syscall.h b/arch/arc/include/asm/syscall.h index 9709256e31c8e..728d625a10f1e 100644 --- a/arch/arc/include/asm/syscall.h +++ b/arch/arc/include/asm/syscall.h @@ -23,6 +23,17 @@ syscall_get_nr(struct task_struct *task, struct pt_regs *regs) return -1; } +static inline void +syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr) +{ + /* + * Unlike syscall_get_nr(), syscall_set_nr() can be called only when + * the target task is stopped for tracing on entering syscall, so + * there is no need to have the same check syscall_get_nr() has. + */ + regs->r8 = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { @@ -67,6 +78,20 @@ syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, } } +static inline void +syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, + unsigned long *args) +{ + unsigned long *inside_ptregs = ®s->r0; + unsigned int n = 6; + unsigned int i = 0; + + while (n--) { + *inside_ptregs = args[i++]; + inside_ptregs--; + } +} + static inline int syscall_get_arch(struct task_struct *task) { diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index fa5939eb9864e..7b71a3d414b72 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -209,7 +209,6 @@ PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF); #define pmd_pfn(pmd) (((pmd_val(pmd) & PMD_MASK) & PHYS_MASK) >> PAGE_SHIFT) #define pfn_pmd(pfn,prot) (__pmd(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))) -#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) /* No hardware dirty/accessed bits -- generic_pmdp_establish() fits */ #define pmdp_establish generic_pmdp_establish diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index 6b986ef6042f8..7f1c3b4e3e044 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -168,7 +168,6 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd) #define pfn_pte(pfn,prot) __pte(__pfn_to_phys(pfn) | pgprot_val(prot)) #define pte_page(pte) pfn_to_page(pte_pfn(pte)) -#define mk_pte(page,prot) pfn_pte(page_to_pfn(page), prot) #define pte_clear(mm,addr,ptep) set_pte_ext(ptep, __pte(0), 0) diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h index fe4326d938c18..18b102a307419 100644 --- a/arch/arm/include/asm/syscall.h +++ b/arch/arm/include/asm/syscall.h @@ -68,6 +68,30 @@ static inline void syscall_set_return_value(struct task_struct *task, regs->ARM_r0 = (long) error ? error : val; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + if (nr == -1) { + task_thread_info(task)->abi_syscall = -1; + /* + * When the syscall number is set to -1, the syscall will be + * skipped. In this case the syscall return value has to be + * set explicitly, otherwise the first syscall argument is + * returned as the syscall return value. + */ + syscall_set_return_value(task, regs, -ENOSYS, 0); + return; + } + if ((IS_ENABLED(CONFIG_AEABI) && !IS_ENABLED(CONFIG_OABI_COMPAT))) { + task_thread_info(task)->abi_syscall = nr; + return; + } + task_thread_info(task)->abi_syscall = + (task_thread_info(task)->abi_syscall & ~__NR_SYSCALL_MASK) | + (nr & __NR_SYSCALL_MASK); +} + #define SYSCALL_MAX_ARGS 7 static inline void syscall_get_arguments(struct task_struct *task, @@ -80,6 +104,19 @@ static inline void syscall_get_arguments(struct task_struct *task, memcpy(args, ®s->ARM_r0 + 1, 5 * sizeof(args[0])); } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + memcpy(®s->ARM_r0, args, 6 * sizeof(args[0])); + /* + * Also copy the first argument into ARM_ORIG_r0 + * so that syscall_get_arguments() would return it + * instead of the previous value. + */ + regs->ARM_ORIG_r0 = regs->ARM_r0; +} + static inline int syscall_get_arch(struct task_struct *task) { /* ARM tasks don't change audit architectures on the fly. */ diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index f02f872ea8a9e..edb7f56b7c910 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -735,7 +735,7 @@ static void *__init late_alloc(unsigned long sz) void *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, get_order(sz)); - if (!ptdesc || !pagetable_pte_ctor(ptdesc)) + if (!ptdesc || !pagetable_pte_ctor(NULL, ptdesc)) BUG(); return ptdesc_to_virt(ptdesc); } diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c index f5f790c6e5f89..3d96fb41d6245 100644 --- a/arch/arm/probes/uprobes/core.c +++ b/arch/arm/probes/uprobes/core.c @@ -26,11 +26,11 @@ bool is_swbp_insn(uprobe_opcode_t *insn) (UPROBE_SWBP_ARM_INSN & 0x0fffffff); } -int set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, +int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { - return uprobe_write_opcode(auprobe, mm, vaddr, - __opcode_to_mem_arm(auprobe->bpinsn)); + return uprobe_write_opcode(auprobe, vma, vaddr, + __opcode_to_mem_arm(auprobe->bpinsn), true); } bool arch_uprobe_ignore(struct arch_uprobe *auprobe, struct pt_regs *regs) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a182295e6f08b..34c79f4fee3f9 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1602,6 +1602,9 @@ config ARCH_SUPPORTS_KEXEC_IMAGE_VERIFY_SIG config ARCH_DEFAULT_KEXEC_IMAGE_VERIFY_SIG def_bool y +config ARCH_SUPPORTS_KEXEC_HANDOVER + def_bool y + config ARCH_SUPPORTS_CRASH_DUMP def_bool y diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h index 6d6d4065b0cb4..265e8301d7ba8 100644 --- a/arch/arm64/include/asm/pgtable-types.h +++ b/arch/arm64/include/asm/pgtable-types.h @@ -11,11 +11,19 @@ #include -typedef u64 pteval_t; -typedef u64 pmdval_t; -typedef u64 pudval_t; -typedef u64 p4dval_t; -typedef u64 pgdval_t; +/* + * Page Table Descriptor + * + * Generic page table descriptor format from which + * all level specific descriptors can be derived. + */ +typedef u64 ptdesc_t; + +typedef ptdesc_t pteval_t; +typedef ptdesc_t pmdval_t; +typedef ptdesc_t pudval_t; +typedef ptdesc_t p4dval_t; +typedef ptdesc_t pgdval_t; /* * These are used to make use of C type-checking.. @@ -46,7 +54,7 @@ typedef struct { pgdval_t pgd; } pgd_t; #define pgd_val(x) ((x).pgd) #define __pgd(x) ((pgd_t) { (x) } ) -typedef struct { pteval_t pgprot; } pgprot_t; +typedef struct { ptdesc_t pgprot; } pgprot_t; #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) } ) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d3b538be1500b..2a77f11b78d53 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -609,7 +609,6 @@ static inline pmd_t pmd_mkspecial(pmd_t pmd) #define __phys_to_pmd_val(phys) __phys_to_pte_val(phys) #define pmd_pfn(pmd) ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT) #define pfn_pmd(pfn,prot) __pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) -#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) #define pud_young(pud) pte_young(pud_pte(pud)) #define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) @@ -813,12 +812,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) /* use ONLY for statically allocated translation tables */ #define pte_offset_kimg(dir,addr) ((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr)))) -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ -#define mk_pte(page,prot) pfn_pte(page_to_pfn(page),prot) - #if CONFIG_PGTABLE_LEVELS > 2 #define pmd_ERROR(e) \ diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h index b2931d1ae0fb3..fded5358641f8 100644 --- a/arch/arm64/include/asm/ptdump.h +++ b/arch/arm64/include/asm/ptdump.h @@ -24,8 +24,8 @@ struct ptdump_info { }; struct ptdump_prot_bits { - u64 mask; - u64 val; + ptdesc_t mask; + ptdesc_t val; const char *set; const char *clear; }; @@ -34,7 +34,7 @@ struct ptdump_pg_level { const struct ptdump_prot_bits *bits; char name[4]; int num; - u64 mask; + ptdesc_t mask; }; /* @@ -51,7 +51,7 @@ struct ptdump_pg_state { const struct mm_struct *mm; unsigned long start_address; int level; - u64 current_prot; + ptdesc_t current_prot; bool check_wx; unsigned long wx_pages; unsigned long uxn_pages; @@ -59,7 +59,13 @@ struct ptdump_pg_state { void ptdump_walk(struct seq_file *s, struct ptdump_info *info); void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, - u64 val); + pteval_t val); +void note_page_pte(struct ptdump_state *st, unsigned long addr, pte_t pte); +void note_page_pmd(struct ptdump_state *st, unsigned long addr, pmd_t pmd); +void note_page_pud(struct ptdump_state *st, unsigned long addr, pud_t pud); +void note_page_p4d(struct ptdump_state *st, unsigned long addr, p4d_t p4d); +void note_page_pgd(struct ptdump_state *st, unsigned long addr, pgd_t pgd); +void note_page_flush(struct ptdump_state *st); #ifdef CONFIG_PTDUMP_DEBUGFS #define EFI_RUNTIME_MAP_END DEFAULT_MAP_WINDOW_64 void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name); @@ -69,7 +75,13 @@ static inline void ptdump_debugfs_register(struct ptdump_info *info, #endif /* CONFIG_PTDUMP_DEBUGFS */ #else static inline void note_page(struct ptdump_state *pt_st, unsigned long addr, - int level, u64 val) { } + int level, pteval_t val) { } +static inline void note_page_pte(struct ptdump_state *st, unsigned long addr, pte_t pte) { } +static inline void note_page_pmd(struct ptdump_state *st, unsigned long addr, pmd_t pmd) { } +static inline void note_page_pud(struct ptdump_state *st, unsigned long addr, pud_t pud) { } +static inline void note_page_p4d(struct ptdump_state *st, unsigned long addr, p4d_t p4d) { } +static inline void note_page_pgd(struct ptdump_state *st, unsigned long addr, pgd_t pgd) { } +static inline void note_page_flush(struct ptdump_state *st) { } #endif /* CONFIG_PTDUMP */ #endif /* __ASM_PTDUMP_H */ diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h index ab8e14b96f681..712daa90e6431 100644 --- a/arch/arm64/include/asm/syscall.h +++ b/arch/arm64/include/asm/syscall.h @@ -61,6 +61,22 @@ static inline void syscall_set_return_value(struct task_struct *task, regs->regs[0] = val; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->syscallno = nr; + if (nr == -1) { + /* + * When the syscall number is set to -1, the syscall will be + * skipped. In this case the syscall return value has to be + * set explicitly, otherwise the first syscall argument is + * returned as the syscall return value. + */ + syscall_set_return_value(task, regs, -ENOSYS, 0); + } +} + #define SYSCALL_MAX_ARGS 6 static inline void syscall_get_arguments(struct task_struct *task, @@ -73,6 +89,19 @@ static inline void syscall_get_arguments(struct task_struct *task, memcpy(args, ®s->regs[1], 5 * sizeof(args[0])); } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + memcpy(®s->regs[0], args, 6 * sizeof(args[0])); + /* + * Also copy the first argument into orig_x0 + * so that syscall_get_arguments() would return it + * instead of the previous value. + */ + regs->orig_x0 = regs->regs[0]; +} + /* * We don't care about endianness (__AUDIT_ARCH_LE bit) here because * AArch64 has the same system calls both on little- and big- endian. diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 1d25d8899dbf7..42e281c07c2f1 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -29,7 +29,7 @@ static bool region_is_misaligned(const efi_memory_desc_t *md) * executable, everything else can be mapped with the XN bits * set. Also take the new (optional) RO/XP bits into account. */ -static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) +static __init ptdesc_t create_mapping_protection(efi_memory_desc_t *md) { u64 attr = md->attribute; u32 type = md->type; @@ -83,7 +83,7 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) { - pteval_t prot_val = create_mapping_protection(md); + ptdesc_t prot_val = create_mapping_protection(md); bool page_mappings_only = (md->type == EFI_RUNTIME_SERVICES_CODE || md->type == EFI_RUNTIME_SERVICES_DATA); diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index e57b043f324b5..a00f57c73d818 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -159,7 +159,7 @@ static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(u64 ttbr) static void __init remap_idmap_for_lpa2(void) { /* clear the bits that change meaning once LPA2 is turned on */ - pteval_t mask = PTE_SHARED; + ptdesc_t mask = PTE_SHARED; /* * We have to clear bits [9:8] in all block or page descriptors in the diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c index 81345f68f9fc0..7982788e7b9aa 100644 --- a/arch/arm64/kernel/pi/map_range.c +++ b/arch/arm64/kernel/pi/map_range.c @@ -30,7 +30,7 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, u64 va_offset) { u64 cmask = (level == 3) ? CONT_PTE_SIZE - 1 : U64_MAX; - pteval_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK; + ptdesc_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK; int lshift = (3 - level) * PTDESC_TABLE_SHIFT; u64 lmask = (PAGE_SIZE << lshift) - 1; @@ -87,7 +87,7 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, } } -asmlinkage u64 __init create_init_idmap(pgd_t *pg_dir, pteval_t clrmask) +asmlinkage u64 __init create_init_idmap(pgd_t *pg_dir, ptdesc_t clrmask) { u64 ptep = (u64)pg_dir + PAGE_SIZE; pgprot_t text_prot = PAGE_KERNEL_ROX; diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h index c91e5e965cd39..91dcb5b6bbd1f 100644 --- a/arch/arm64/kernel/pi/pi.h +++ b/arch/arm64/kernel/pi/pi.h @@ -33,4 +33,4 @@ void map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, asmlinkage void early_map_kernel(u64 boot_status, void *fdt); -asmlinkage u64 create_init_idmap(pgd_t *pgd, pteval_t clrmask); +asmlinkage u64 create_init_idmap(pgd_t *pgd, ptdesc_t clrmask); diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 07aeab8a76065..c86c348857c40 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -83,7 +83,7 @@ arch_initcall(adjust_protection_map); pgprot_t vm_get_page_prot(unsigned long vm_flags) { - pteval_t prot; + ptdesc_t prot; /* Short circuit GCS to avoid bloating the table. */ if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) { diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ea6695d53fb96..8fcf59ba39db7 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -46,6 +46,13 @@ #define NO_CONT_MAPPINGS BIT(1) #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */ +enum pgtable_type { + TABLE_PTE, + TABLE_PMD, + TABLE_PUD, + TABLE_P4D, +}; + u64 kimage_voffset __ro_after_init; EXPORT_SYMBOL(kimage_voffset); @@ -107,7 +114,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, } EXPORT_SYMBOL(phys_mem_access_prot); -static phys_addr_t __init early_pgtable_alloc(int shift) +static phys_addr_t __init early_pgtable_alloc(enum pgtable_type pgtable_type) { phys_addr_t phys; @@ -192,7 +199,7 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end, static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), + phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags) { unsigned long next; @@ -207,7 +214,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, if (flags & NO_EXEC_MAPPINGS) pmdval |= PMD_TABLE_PXN; BUG_ON(!pgtable_alloc); - pte_phys = pgtable_alloc(PAGE_SHIFT); + pte_phys = pgtable_alloc(TABLE_PTE); ptep = pte_set_fixmap(pte_phys); init_clear_pgtable(ptep); ptep += pte_index(addr); @@ -243,7 +250,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), int flags) + phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags) { unsigned long next; @@ -277,7 +284,8 @@ static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), int flags) + phys_addr_t (*pgtable_alloc)(enum pgtable_type), + int flags) { unsigned long next; pud_t pud = READ_ONCE(*pudp); @@ -294,7 +302,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, if (flags & NO_EXEC_MAPPINGS) pudval |= PUD_TABLE_PXN; BUG_ON(!pgtable_alloc); - pmd_phys = pgtable_alloc(PMD_SHIFT); + pmd_phys = pgtable_alloc(TABLE_PMD); pmdp = pmd_set_fixmap(pmd_phys); init_clear_pgtable(pmdp); pmdp += pmd_index(addr); @@ -325,7 +333,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), + phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags) { unsigned long next; @@ -339,7 +347,7 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, if (flags & NO_EXEC_MAPPINGS) p4dval |= P4D_TABLE_PXN; BUG_ON(!pgtable_alloc); - pud_phys = pgtable_alloc(PUD_SHIFT); + pud_phys = pgtable_alloc(TABLE_PUD); pudp = pud_set_fixmap(pud_phys); init_clear_pgtable(pudp); pudp += pud_index(addr); @@ -383,7 +391,7 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), + phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags) { unsigned long next; @@ -397,7 +405,7 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, if (flags & NO_EXEC_MAPPINGS) pgdval |= PGD_TABLE_PXN; BUG_ON(!pgtable_alloc); - p4d_phys = pgtable_alloc(P4D_SHIFT); + p4d_phys = pgtable_alloc(TABLE_P4D); p4dp = p4d_set_fixmap(p4d_phys); init_clear_pgtable(p4dp); p4dp += p4d_index(addr); @@ -427,7 +435,7 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), + phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags) { unsigned long addr, end, next; @@ -455,7 +463,7 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), + phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags) { mutex_lock(&fixmap_lock); @@ -468,37 +476,48 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, extern __alias(__create_pgd_mapping_locked) void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), int flags); + phys_addr_t (*pgtable_alloc)(enum pgtable_type), + int flags); #endif -static phys_addr_t __pgd_pgtable_alloc(int shift) +static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, + enum pgtable_type pgtable_type) { /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */ - void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO); + struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0); + phys_addr_t pa; + + BUG_ON(!ptdesc); + pa = page_to_phys(ptdesc_page(ptdesc)); + + switch (pgtable_type) { + case TABLE_PTE: + BUG_ON(!pagetable_pte_ctor(mm, ptdesc)); + break; + case TABLE_PMD: + BUG_ON(!pagetable_pmd_ctor(mm, ptdesc)); + break; + case TABLE_PUD: + pagetable_pud_ctor(ptdesc); + break; + case TABLE_P4D: + pagetable_p4d_ctor(ptdesc); + break; + } - BUG_ON(!ptr); - return __pa(ptr); + return pa; } -static phys_addr_t pgd_pgtable_alloc(int shift) +static phys_addr_t __maybe_unused +pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type) { - phys_addr_t pa = __pgd_pgtable_alloc(shift); - struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa)); - - /* - * Call proper page table ctor in case later we need to - * call core mm functions like apply_to_page_range() on - * this pre-allocated page table. - * - * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is - * folded, and if so pagetable_pte_ctor() becomes nop. - */ - if (shift == PAGE_SHIFT) - BUG_ON(!pagetable_pte_ctor(ptdesc)); - else if (shift == PMD_SHIFT) - BUG_ON(!pagetable_pmd_ctor(ptdesc)); + return __pgd_pgtable_alloc(&init_mm, pgtable_type); +} - return pa; +static phys_addr_t +pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type) +{ + return __pgd_pgtable_alloc(NULL, pgtable_type); } /* @@ -530,7 +549,7 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(mm->pgd, phys, virt, size, prot, - pgd_pgtable_alloc, flags); + pgd_pgtable_alloc_special_mm, flags); } static void update_mapping_prot(phys_addr_t phys, unsigned long virt, @@ -744,7 +763,7 @@ static int __init map_entry_trampoline(void) memset(tramp_pg_dir, 0, PGD_SIZE); __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, entry_tramp_text_size(), prot, - __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS); + pgd_pgtable_alloc_init_mm, NO_BLOCK_MAPPINGS); /* Map both the text and data into the kernel page table */ for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++) @@ -1350,7 +1369,7 @@ int arch_add_memory(int nid, u64 start, u64 size, flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), - size, params->pgprot, __pgd_pgtable_alloc, + size, params->pgprot, pgd_pgtable_alloc_init_mm, flags); memblock_clear_nomap(start, size); diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 8cec0da4cff28..421a5de806c62 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -189,12 +189,12 @@ static void note_prot_wx(struct ptdump_pg_state *st, unsigned long addr) } void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, - u64 val) + pteval_t val) { struct ptdump_pg_state *st = container_of(pt_st, struct ptdump_pg_state, ptdump); struct ptdump_pg_level *pg_level = st->pg_level; static const char units[] = "KMGTPE"; - u64 prot = 0; + ptdesc_t prot = 0; /* check if the current level has been folded dynamically */ if (st->mm && ((level == 1 && mm_p4d_folded(st->mm)) || @@ -251,6 +251,38 @@ void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } +void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte) +{ + note_page(pt_st, addr, 4, pte_val(pte)); +} + +void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd) +{ + note_page(pt_st, addr, 3, pmd_val(pmd)); +} + +void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud) +{ + note_page(pt_st, addr, 2, pud_val(pud)); +} + +void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d) +{ + note_page(pt_st, addr, 1, p4d_val(p4d)); +} + +void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd) +{ + note_page(pt_st, addr, 0, pgd_val(pgd)); +} + +void note_page_flush(struct ptdump_state *pt_st) +{ + pte_t pte_zero = {0}; + + note_page(pt_st, 0, -1, pte_val(pte_zero)); +} + void ptdump_walk(struct seq_file *s, struct ptdump_info *info) { unsigned long end = ~0UL; @@ -266,7 +298,12 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info) .pg_level = &kernel_pg_levels[0], .level = -1, .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = (struct ptdump_range[]){ {info->base_addr, end}, {0, 0} @@ -303,7 +340,12 @@ bool ptdump_check_wx(void) .level = -1, .check_wx = true, .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = (struct ptdump_range[]) { {_PAGE_OFFSET(vabits_actual), ~0UL}, {0, 0} diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h index 11055c5749686..9ed2b15ffd940 100644 --- a/arch/csky/include/asm/pgalloc.h +++ b/arch/csky/include/asm/pgalloc.h @@ -29,7 +29,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) pte_t *pte; unsigned long i; - pte = (pte_t *) __get_free_page(GFP_KERNEL); + pte = __pte_alloc_one_kernel(mm); if (!pte) return NULL; diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index a397e1718ab67..b8378431aeff7 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -249,11 +249,6 @@ static inline pgprot_t pgprot_writecombine(pgprot_t _prot) return __pgprot(prot); } -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { return __pte((pte_val(pte) & _PAGE_CHG_MASK) | diff --git a/arch/csky/include/asm/syscall.h b/arch/csky/include/asm/syscall.h index 0de5734950bf9..717f44b4d26f5 100644 --- a/arch/csky/include/asm/syscall.h +++ b/arch/csky/include/asm/syscall.h @@ -59,6 +59,19 @@ syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, memcpy(args, ®s->a1, 5 * sizeof(args[0])); } +static inline void +syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, + const unsigned long *args) +{ + memcpy(®s->a0, args, 6 * sizeof(regs->a0)); + /* + * Also copy the first argument into orig_a0 + * so that syscall_get_arguments() would return it + * instead of the previous value. + */ + regs->orig_a0 = regs->a0; +} + static inline int syscall_get_arch(struct task_struct *task) { diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index 8c5b7a1c3d909..9fbdfdbc539f8 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -238,9 +238,6 @@ static inline int pte_present(pte_t pte) return pte_val(pte) & _PAGE_PRESENT; } -/* mk_pte - make a PTE out of a page pointer and protection bits */ -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) - /* pte_page - returns a page (frame pointer/descriptor?) based on a PTE */ #define pte_page(x) pfn_to_page(pte_pfn(x)) diff --git a/arch/hexagon/include/asm/syscall.h b/arch/hexagon/include/asm/syscall.h index f6e454f18038f..70637261817af 100644 --- a/arch/hexagon/include/asm/syscall.h +++ b/arch/hexagon/include/asm/syscall.h @@ -26,6 +26,13 @@ static inline long syscall_get_nr(struct task_struct *task, return regs->r06; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->r06 = nr; +} + static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args) @@ -33,6 +40,13 @@ static inline void syscall_get_arguments(struct task_struct *task, memcpy(args, &(®s->r00)[0], 6 * sizeof(args[0])); } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned long *args) +{ + memcpy(&(®s->r00)[0], args, 6 * sizeof(args[0])); +} + static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { @@ -45,6 +59,13 @@ static inline long syscall_get_return_value(struct task_struct *task, return regs->r00; } +static inline void syscall_set_return_value(struct task_struct *task, + struct pt_regs *regs, + int error, long val) +{ + regs->r00 = (long) error ?: val; +} + static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_HEXAGON; diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h index b58f587f0f0ad..1c63a9d9a6d35 100644 --- a/arch/loongarch/include/asm/pgalloc.h +++ b/arch/loongarch/include/asm/pgalloc.h @@ -69,7 +69,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) if (!ptdesc) return NULL; - if (!pagetable_pmd_ctor(ptdesc)) { + if (!pagetable_pmd_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index da346733a1dae..a3f17914dbab1 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -255,7 +255,6 @@ static inline void pmd_clear(pmd_t *pmdp) #define pmd_page_vaddr(pmd) pmd_val(pmd) -extern pmd_t mk_pmd(struct page *page, pgprot_t prot); extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); #define pte_page(x) pfn_to_page(pte_pfn(x)) @@ -426,12 +425,6 @@ static inline unsigned long pte_accessible(struct mm_struct *mm, pte_t a) return false; } -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) - static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { return __pte((pte_val(pte) & _PAGE_CHG_MASK) | diff --git a/arch/loongarch/include/asm/syscall.h b/arch/loongarch/include/asm/syscall.h index e286dc58476e6..81d2733f7b949 100644 --- a/arch/loongarch/include/asm/syscall.h +++ b/arch/loongarch/include/asm/syscall.h @@ -26,6 +26,13 @@ static inline long syscall_get_nr(struct task_struct *task, return regs->regs[11]; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->regs[11] = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { @@ -61,6 +68,14 @@ static inline void syscall_get_arguments(struct task_struct *task, memcpy(&args[1], ®s->regs[5], 5 * sizeof(long)); } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned long *args) +{ + regs->orig_a0 = args[0]; + memcpy(®s->regs[5], &args[1], 5 * sizeof(long)); +} + static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_LOONGARCH64; diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c index 22a94bb3e6e8b..352d9b2e02ab5 100644 --- a/arch/loongarch/mm/pgtable.c +++ b/arch/loongarch/mm/pgtable.c @@ -135,15 +135,6 @@ void kernel_pte_init(void *addr) } while (p != end); } -pmd_t mk_pmd(struct page *page, pgprot_t prot) -{ - pmd_t pmd; - - pmd_val(pmd) = (page_to_pfn(page) << PFN_PTE_SHIFT) | pgprot_val(prot); - - return pmd; -} - void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h index 4c648b51e7fd5..fc5454d37da31 100644 --- a/arch/m68k/include/asm/mcf_pgalloc.h +++ b/arch/m68k/include/asm/mcf_pgalloc.h @@ -7,7 +7,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - pagetable_free(virt_to_ptdesc(pte)); + pagetable_dtor_free(virt_to_ptdesc(pte)); } extern const char bad_pmd_string[]; @@ -19,6 +19,10 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) if (!ptdesc) return NULL; + if (!pagetable_pte_ctor(mm, ptdesc)) { + pagetable_free(ptdesc); + return NULL; + } return ptdesc_address(ptdesc); } @@ -48,7 +52,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) if (!ptdesc) return NULL; - if (!pagetable_pte_ctor(ptdesc)) { + if (!pagetable_pte_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h index 48f87a8a88320..f5c596b211d42 100644 --- a/arch/m68k/include/asm/mcf_pgtable.h +++ b/arch/m68k/include/asm/mcf_pgtable.h @@ -96,12 +96,6 @@ #define pmd_pgtable(pmd) pfn_to_virt(pmd_val(pmd) >> PAGE_SHIFT) -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) - static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pte_val(pte) = (pte_val(pte) & CF_PAGE_CHG_MASK) | pgprot_val(newprot); diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h index 5abe7da8ac5ab..1091fb0affbee 100644 --- a/arch/m68k/include/asm/motorola_pgalloc.h +++ b/arch/m68k/include/asm/motorola_pgalloc.h @@ -15,7 +15,7 @@ enum m68k_table_types { }; extern void init_pointer_table(void *table, int type); -extern void *get_pointer_table(int type); +extern void *get_pointer_table(struct mm_struct *mm, int type); extern int free_pointer_table(void *table, int type); /* @@ -26,7 +26,7 @@ extern int free_pointer_table(void *table, int type); static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - return get_pointer_table(TABLE_PTE); + return get_pointer_table(mm, TABLE_PTE); } static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) @@ -36,7 +36,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { - return get_pointer_table(TABLE_PTE); + return get_pointer_table(mm, TABLE_PTE); } static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable) @@ -53,7 +53,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pgtable, static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) { - return get_pointer_table(TABLE_PMD); + return get_pointer_table(mm, TABLE_PMD); } static inline int pmd_free(struct mm_struct *mm, pmd_t *pmd) @@ -75,7 +75,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return get_pointer_table(TABLE_PGD); + return get_pointer_table(mm, TABLE_PGD); } diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h index 9866c7acdabe1..040ac3bad713c 100644 --- a/arch/m68k/include/asm/motorola_pgtable.h +++ b/arch/m68k/include/asm/motorola_pgtable.h @@ -81,12 +81,6 @@ extern unsigned long mm_cachebits; #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) - static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pte_val(pte) = (pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot); diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h index 30081aee81643..73745dc0ec0e0 100644 --- a/arch/m68k/include/asm/sun3_pgtable.h +++ b/arch/m68k/include/asm/sun3_pgtable.h @@ -76,12 +76,6 @@ #ifndef __ASSEMBLY__ -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) - static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pte_val(pte) = (pte_val(pte) & SUN3_PAGE_CHG_MASK) | pgprot_val(newprot); diff --git a/arch/m68k/include/asm/syscall.h b/arch/m68k/include/asm/syscall.h index d1453e850cddb..bf84b160c2eb3 100644 --- a/arch/m68k/include/asm/syscall.h +++ b/arch/m68k/include/asm/syscall.h @@ -14,6 +14,13 @@ static inline int syscall_get_nr(struct task_struct *task, return regs->orig_d0; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->orig_d0 = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index 73651e093c4dc..6ab3ef39ba7a9 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -139,7 +139,7 @@ void __init init_pointer_table(void *table, int type) return; } -void *get_pointer_table(int type) +void *get_pointer_table(struct mm_struct *mm, int type) { ptable_desc *dp = ptable_list[type].next; unsigned int mask = list_empty(&ptable_list[type]) ? 0 : PD_MARKBITS(dp); @@ -164,10 +164,10 @@ void *get_pointer_table(int type) * m68k doesn't have SPLIT_PTE_PTLOCKS for not having * SMP. */ - pagetable_pte_ctor(virt_to_ptdesc(page)); + pagetable_pte_ctor(mm, virt_to_ptdesc(page)); break; case TABLE_PMD: - pagetable_pmd_ctor(virt_to_ptdesc(page)); + pagetable_pmd_ctor(mm, virt_to_ptdesc(page)); break; case TABLE_PGD: pagetable_pgd_ctor(virt_to_ptdesc(page)); diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index e4ea2ec3642f0..b1bb2c65dd047 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -285,14 +285,6 @@ static inline pte_t mk_pte_phys(phys_addr_t physpage, pgprot_t pgprot) return pte; } -#define mk_pte(page, pgprot) \ -({ \ - pte_t pte; \ - pte_val(pte) = (((page - mem_map) << PAGE_SHIFT) + memory_start) | \ - pgprot_val(pgprot); \ - pte; \ -}) - static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pte_val(pte) = (pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot); diff --git a/arch/microblaze/include/asm/syscall.h b/arch/microblaze/include/asm/syscall.h index 5eb3f624cc590..b5b6b91fae3e9 100644 --- a/arch/microblaze/include/asm/syscall.h +++ b/arch/microblaze/include/asm/syscall.h @@ -14,6 +14,13 @@ static inline long syscall_get_nr(struct task_struct *task, return regs->r12; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->r12 = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c index 9f73265aad4e8..e96dd1b7aba49 100644 --- a/arch/microblaze/mm/pgtable.c +++ b/arch/microblaze/mm/pgtable.c @@ -245,7 +245,7 @@ unsigned long iopa(unsigned long addr) __ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { if (mem_init_done) - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + return __pte_alloc_one_kernel(mm); else return memblock_alloc_try_nid(PAGE_SIZE, PAGE_SIZE, MEMBLOCK_LOW_LIMIT, diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index bbca420c96d3c..942af87f1cddb 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -62,7 +62,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) if (!ptdesc) return NULL; - if (!pagetable_pmd_ctor(ptdesc)) { + if (!pagetable_pmd_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index c29a551eb0ca8..4852b005a72d3 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -504,12 +504,6 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, return true; } -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) - #if defined(CONFIG_XPA) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { @@ -719,9 +713,6 @@ static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ -/* Extern to avoid header file madness */ -extern pmd_t mk_pmd(struct page *page, pgprot_t prot); - static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { pmd_val(pmd) = (pmd_val(pmd) & (_PAGE_CHG_MASK | _PAGE_HUGE)) | diff --git a/arch/mips/include/asm/syscall.h b/arch/mips/include/asm/syscall.h index 056aa1b713e23..d19e67e2aa6a1 100644 --- a/arch/mips/include/asm/syscall.h +++ b/arch/mips/include/asm/syscall.h @@ -41,6 +41,21 @@ static inline long syscall_get_nr(struct task_struct *task, return task_thread_info(task)->syscall; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + /* + * New syscall number has to be assigned to regs[2] because + * it is loaded from there unconditionally after return from + * syscall_trace_enter() invocation. + * + * Consequently, if the syscall was indirect and nr != __NR_syscall, + * then after this assignment the syscall will cease to be indirect. + */ + task_thread_info(task)->syscall = regs->regs[2] = nr; +} + static inline void mips_syscall_update_nr(struct task_struct *task, struct pt_regs *regs) { @@ -74,6 +89,23 @@ static inline void mips_get_syscall_arg(unsigned long *arg, #endif } +static inline void mips_set_syscall_arg(unsigned long *arg, + struct task_struct *task, struct pt_regs *regs, unsigned int n) +{ +#ifdef CONFIG_32BIT + switch (n) { + case 0: case 1: case 2: case 3: + regs->regs[4 + n] = *arg; + return; + case 4: case 5: case 6: case 7: + *arg = regs->args[n] = *arg; + return; + } +#else + regs->regs[4 + n] = *arg; +#endif +} + static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { @@ -120,6 +152,17 @@ static inline void syscall_get_arguments(struct task_struct *task, mips_get_syscall_arg(args++, task, regs, i++); } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned long *args) +{ + unsigned int i = 0; + unsigned int n = 6; + + while (n--) + mips_set_syscall_arg(args++, task, regs, i++); +} + extern const unsigned long sys_call_table[]; extern const unsigned long sys32_call_table[]; extern const unsigned long sysn32_call_table[]; diff --git a/arch/mips/mm/pgtable-32.c b/arch/mips/mm/pgtable-32.c index 84dd5136d53a6..e2cf2166d5cb3 100644 --- a/arch/mips/mm/pgtable-32.c +++ b/arch/mips/mm/pgtable-32.c @@ -31,16 +31,6 @@ void pgd_init(void *addr) } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) -pmd_t mk_pmd(struct page *page, pgprot_t prot) -{ - pmd_t pmd; - - pmd_val(pmd) = (page_to_pfn(page) << PFN_PTE_SHIFT) | pgprot_val(prot); - - return pmd; -} - - void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { diff --git a/arch/mips/mm/pgtable-64.c b/arch/mips/mm/pgtable-64.c index 1e544827dea9d..b24f865de357c 100644 --- a/arch/mips/mm/pgtable-64.c +++ b/arch/mips/mm/pgtable-64.c @@ -90,15 +90,6 @@ void pud_init(void *addr) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE -pmd_t mk_pmd(struct page *page, pgprot_t prot) -{ - pmd_t pmd; - - pmd_val(pmd) = (page_to_pfn(page) << PFN_PTE_SHIFT) | pgprot_val(prot); - - return pmd; -} - void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index eab87c6beacb5..f490f2fa0dca0 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -217,12 +217,6 @@ static inline void pte_clear(struct mm_struct *mm, set_pte(ptep, null); } -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ -#define mk_pte(page, prot) (pfn_pte(page_to_pfn(page), prot)) - /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. diff --git a/arch/nios2/include/asm/syscall.h b/arch/nios2/include/asm/syscall.h index fff52205fb652..8e3eb1d689bbe 100644 --- a/arch/nios2/include/asm/syscall.h +++ b/arch/nios2/include/asm/syscall.h @@ -15,6 +15,11 @@ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) return regs->r2; } +static inline void syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr) +{ + regs->r2 = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { @@ -58,6 +63,17 @@ static inline void syscall_get_arguments(struct task_struct *task, *args = regs->r9; } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, const unsigned long *args) +{ + regs->r4 = *args++; + regs->r5 = *args++; + regs->r6 = *args++; + regs->r7 = *args++; + regs->r8 = *args++; + regs->r9 = *args; +} + static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_NIOS2; diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 60c6ce7ff2dcf..71bfb8c8c482a 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -299,8 +299,6 @@ static inline pte_t __mk_pte(void *page, pgprot_t pgprot) return pte; } -#define mk_pte(page, pgprot) __mk_pte(page_address(page), (pgprot)) - #define mk_pte_phys(physpage, pgprot) \ ({ \ pte_t __pte; \ diff --git a/arch/openrisc/include/asm/syscall.h b/arch/openrisc/include/asm/syscall.h index 903ed882bdecf..5e037d9659c5b 100644 --- a/arch/openrisc/include/asm/syscall.h +++ b/arch/openrisc/include/asm/syscall.h @@ -25,6 +25,12 @@ syscall_get_nr(struct task_struct *task, struct pt_regs *regs) return regs->orig_gpr11; } +static inline void +syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr) +{ + regs->orig_gpr11 = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { @@ -57,6 +63,13 @@ syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, memcpy(args, ®s->gpr[3], 6 * sizeof(args[0])); } +static inline void +syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, + const unsigned long *args) +{ + memcpy(®s->gpr[3], args, 6 * sizeof(args[0])); +} + static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_OPENRISC; diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c index 8e63e86251ca6..3b352f97fecb0 100644 --- a/arch/openrisc/mm/ioremap.c +++ b/arch/openrisc/mm/ioremap.c @@ -36,7 +36,7 @@ pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm) pte_t *pte; if (likely(mem_init_done)) { - pte = (pte_t *)get_zeroed_page(GFP_KERNEL); + pte = __pte_alloc_one_kernel(mm); } else { pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); } diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index 2ca74a56415ca..3b84ee93edaa1 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h @@ -39,7 +39,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) ptdesc = pagetable_alloc(gfp, PMD_TABLE_ORDER); if (!ptdesc) return NULL; - if (!pagetable_pmd_ctor(ptdesc)) { + if (!pagetable_pmd_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index babf65751e818..e85963fefa638 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -338,10 +338,6 @@ static inline pte_t pte_mkspecial(pte_t pte) { pte_val(pte) |= _PAGE_SPECIAL; re #endif -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ #define __mk_pte(addr,pgprot) \ ({ \ pte_t __pte; \ @@ -351,8 +347,6 @@ static inline pte_t pte_mkspecial(pte_t pte) { pte_val(pte) |= _PAGE_SPECIAL; re __pte; \ }) -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) - static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) { pte_t pte; diff --git a/arch/parisc/include/asm/syscall.h b/arch/parisc/include/asm/syscall.h index 00b127a5e09ba..c11222798ab2f 100644 --- a/arch/parisc/include/asm/syscall.h +++ b/arch/parisc/include/asm/syscall.h @@ -17,6 +17,13 @@ static inline long syscall_get_nr(struct task_struct *tsk, return regs->gr[20]; } +static inline void syscall_set_nr(struct task_struct *tsk, + struct pt_regs *regs, + int nr) +{ + regs->gr[20] = nr; +} + static inline void syscall_get_arguments(struct task_struct *tsk, struct pt_regs *regs, unsigned long *args) @@ -29,6 +36,18 @@ static inline void syscall_get_arguments(struct task_struct *tsk, args[0] = regs->gr[26]; } +static inline void syscall_set_arguments(struct task_struct *tsk, + struct pt_regs *regs, + unsigned long *args) +{ + regs->gr[21] = args[5]; + regs->gr[22] = args[4]; + regs->gr[23] = args[3]; + regs->gr[24] = args[2]; + regs->gr[25] = args[1]; + regs->gr[26] = args[0]; +} + static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 6d98e6f08d4de..6ed93e290c2fe 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1096,7 +1096,6 @@ static inline bool pmd_access_permitted(pmd_t pmd, bool write) #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot); -extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); extern pud_t pud_modify(pud_t pud, pgprot_t newprot); extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 2f72ad885332e..93d77ad5a92fa 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -53,9 +53,8 @@ void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, #define MAX_PTRS_PER_PGD PTRS_PER_PGD #endif -/* Keep these as a macros to avoid include dependency mess */ +/* Keep this as a macro to avoid include dependency mess */ #define pte_page(x) pfn_to_page(pte_pfn(x)) -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) static inline unsigned long pte_pfn(pte_t pte) { diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index 3dd36c5e334a9..4b3c52ed6e9d2 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -39,6 +39,16 @@ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) return -1; } +static inline void syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr) +{ + /* + * Unlike syscall_get_nr(), syscall_set_nr() can be called only when + * the target task is stopped for tracing on entering syscall, so + * there is no need to have the same check syscall_get_nr() has. + */ + regs->gpr[0] = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { @@ -110,6 +120,16 @@ static inline void syscall_get_arguments(struct task_struct *task, } } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + memcpy(®s->gpr[3], args, 6 * sizeof(args[0])); + + /* Also copy the first argument into orig_gpr3 */ + regs->orig_gpr3 = args[0]; +} + static inline int syscall_get_arch(struct task_struct *task) { if (is_tsk_32bit_task(task)) diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 8f7d41ce2ca18..0db01e10a3f84 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -269,11 +269,6 @@ pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot) return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot)); } -pmd_t mk_pmd(struct page *page, pgprot_t pgprot) -{ - return pfn_pmd(page_to_pfn(page), pgprot); -} - pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { unsigned long pmdv; @@ -422,7 +417,7 @@ static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) ptdesc = pagetable_alloc(gfp, 0); if (!ptdesc) return NULL; - if (!pagetable_pmd_ctor(ptdesc)) { + if (!pagetable_pmd_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index 713268ccb1a0e..77e55eac16e42 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -56,19 +56,17 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) { void *ret = NULL; struct ptdesc *ptdesc; + gfp_t gfp = PGALLOC_GFP; - if (!kernel) { - ptdesc = pagetable_alloc(PGALLOC_GFP | __GFP_ACCOUNT, 0); - if (!ptdesc) - return NULL; - if (!pagetable_pte_ctor(ptdesc)) { - pagetable_free(ptdesc); - return NULL; - } - } else { - ptdesc = pagetable_alloc(PGALLOC_GFP, 0); - if (!ptdesc) - return NULL; + if (!kernel) + gfp |= __GFP_ACCOUNT; + + ptdesc = pagetable_alloc(gfp, 0); + if (!ptdesc) + return NULL; + if (!pagetable_pte_ctor(mm, ptdesc)) { + pagetable_free(ptdesc); + return NULL; } atomic_set(&ptdesc->pt_frag_refcount, 1); @@ -124,12 +122,10 @@ void pte_fragment_free(unsigned long *table, int kernel) BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { - if (kernel) - pagetable_free(ptdesc); - else if (folio_test_clear_active(ptdesc_folio(ptdesc))) - call_rcu(&ptdesc->pt_rcu_head, pte_free_now); - else + if (kernel || !folio_test_clear_active(ptdesc_folio(ptdesc))) pte_free_now(&ptdesc->pt_rcu_head); + else + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); } } diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 9dc239967b77f..b2358d794855c 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -298,6 +298,38 @@ static void populate_markers(void) #endif } +static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte) +{ + note_page(pt_st, addr, 4, pte_val(pte)); +} + +static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd) +{ + note_page(pt_st, addr, 3, pmd_val(pmd)); +} + +static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud) +{ + note_page(pt_st, addr, 2, pud_val(pud)); +} + +static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d) +{ + note_page(pt_st, addr, 1, p4d_val(p4d)); +} + +static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd) +{ + note_page(pt_st, addr, 0, pgd_val(pgd)); +} + +static void note_page_flush(struct ptdump_state *pt_st) +{ + pte_t pte_zero = {0}; + + note_page(pt_st, 0, -1, pte_val(pte_zero)); +} + static int ptdump_show(struct seq_file *m, void *v) { struct pg_state st = { @@ -305,7 +337,12 @@ static int ptdump_show(struct seq_file *m, void *v) .marker = address_markers, .level = -1, .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = ptdump_range, } }; @@ -338,7 +375,12 @@ bool ptdump_check_wx(void) .level = -1, .check_wx = true, .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = ptdump_range, } }; diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index b906d28f74fd4..42ff4d167acc5 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2239,6 +2239,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, struct pt_regs *regs) { u64 period = event->hw.sample_period; + const u64 last_period = event->hw.last_period; s64 prev, delta, left; int record = 0; @@ -2320,7 +2321,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (record) { struct perf_sample_data data; - perf_sample_data_init(&data, ~0ULL, event->hw.last_period); + perf_sample_data_init(&data, ~0ULL, last_period); if (event->attr.sample_type & PERF_SAMPLE_ADDR_TYPE) perf_get_data_addr(event, regs, &data.addr); diff --git a/arch/powerpc/perf/core-fsl-emb.c b/arch/powerpc/perf/core-fsl-emb.c index 1a53ab08447cb..d2ffcc7021c51 100644 --- a/arch/powerpc/perf/core-fsl-emb.c +++ b/arch/powerpc/perf/core-fsl-emb.c @@ -590,6 +590,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, struct pt_regs *regs) { u64 period = event->hw.sample_period; + const u64 last_period = event->hw.last_period; s64 prev, delta, left; int record = 0; @@ -632,7 +633,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (record) { struct perf_sample_data data; - perf_sample_data_init(&data, 0, event->hw.last_period); + perf_sample_data_init(&data, 0, last_period); if (perf_event_overflow(event, &data, regs)) fsl_emb_pmu_stop(event, 0); diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index 0897dd99ab8d5..188fadc1c21f9 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -262,8 +262,6 @@ static inline unsigned long _pmd_pfn(pmd_t pmd) return __page_val_to_pfn(pmd_val(pmd)); } -#define mk_pmd(page, prot) pfn_pmd(page_to_pfn(page), prot) - #define pmd_ERROR(e) \ pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e)) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 428e48e5f57d0..f19240fd018e1 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -343,8 +343,6 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) return __pte((pfn << _PAGE_PFN_SHIFT) | prot_val); } -#define mk_pte(page, prot) pfn_pte(page_to_pfn(page), prot) - #define pte_pgprot pte_pgprot static inline pgprot_t pte_pgprot(pte_t pte) { diff --git a/arch/riscv/include/asm/syscall.h b/arch/riscv/include/asm/syscall.h index 121fff429dce6..70ec19dc85061 100644 --- a/arch/riscv/include/asm/syscall.h +++ b/arch/riscv/include/asm/syscall.h @@ -30,6 +30,13 @@ static inline int syscall_get_nr(struct task_struct *task, return regs->a7; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->a7 = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { @@ -66,6 +73,18 @@ static inline void syscall_get_arguments(struct task_struct *task, memcpy(args, ®s->a1, 5 * sizeof(args[0])); } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + regs->orig_a0 = args[0]; + regs->a1 = args[1]; + regs->a2 = args[2]; + regs->a3 = args[3]; + regs->a4 = args[4]; + regs->a5 = args[5]; +} + static inline int syscall_get_arch(struct task_struct *task) { #ifdef CONFIG_64BIT diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index ab475ec6ca429..8d0374d7ce8ed 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -442,7 +442,12 @@ static phys_addr_t __meminit alloc_pte_late(uintptr_t va) { struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); - BUG_ON(!ptdesc || !pagetable_pte_ctor(ptdesc)); + /* + * We do not know which mm the PTE page is associated to at this point. + * Passing NULL to the ctor is the safe option, though it may result + * in unnecessary work (e.g. initialising the ptlock for init_mm). + */ + BUG_ON(!ptdesc || !pagetable_pte_ctor(NULL, ptdesc)); return __pa((pte_t *)ptdesc_address(ptdesc)); } @@ -522,7 +527,8 @@ static phys_addr_t __meminit alloc_pmd_late(uintptr_t va) { struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); - BUG_ON(!ptdesc || !pagetable_pmd_ctor(ptdesc)); + /* See comment in alloc_pte_late() regarding NULL passed the ctor */ + BUG_ON(!ptdesc || !pagetable_pmd_ctor(NULL, ptdesc)); return __pa((pmd_t *)ptdesc_address(ptdesc)); } @@ -584,11 +590,11 @@ static phys_addr_t __init alloc_pud_fixmap(uintptr_t va) static phys_addr_t __meminit alloc_pud_late(uintptr_t va) { - unsigned long vaddr; + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); - vaddr = __get_free_page(GFP_KERNEL); - BUG_ON(!vaddr); - return __pa(vaddr); + BUG_ON(!ptdesc); + pagetable_pud_ctor(ptdesc); + return __pa((pud_t *)ptdesc_address(ptdesc)); } static p4d_t *__init get_p4d_virt_early(phys_addr_t pa) @@ -622,11 +628,11 @@ static phys_addr_t __init alloc_p4d_fixmap(uintptr_t va) static phys_addr_t __meminit alloc_p4d_late(uintptr_t va) { - unsigned long vaddr; + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); - vaddr = __get_free_page(GFP_KERNEL); - BUG_ON(!vaddr); - return __pa(vaddr); + BUG_ON(!ptdesc); + pagetable_p4d_ctor(ptdesc); + return __pa((p4d_t *)ptdesc_address(ptdesc)); } static void __meminit create_pud_mapping(pud_t *pudp, uintptr_t va, phys_addr_t pa, phys_addr_t sz, diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 9d5f657a251b3..32922550a50a3 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -318,6 +318,38 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, } } +static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte) +{ + note_page(pt_st, addr, 4, pte_val(pte)); +} + +static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd) +{ + note_page(pt_st, addr, 3, pmd_val(pmd)); +} + +static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud) +{ + note_page(pt_st, addr, 2, pud_val(pud)); +} + +static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d) +{ + note_page(pt_st, addr, 1, p4d_val(p4d)); +} + +static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd) +{ + note_page(pt_st, addr, 0, pgd_val(pgd)); +} + +static void note_page_flush(struct ptdump_state *pt_st) +{ + pte_t pte_zero = {0}; + + note_page(pt_st, 0, -1, pte_val(pte_zero)); +} + static void ptdump_walk(struct seq_file *s, struct ptd_mm_info *pinfo) { struct pg_state st = { @@ -325,7 +357,12 @@ static void ptdump_walk(struct seq_file *s, struct ptd_mm_info *pinfo) .marker = pinfo->markers, .level = -1, .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = (struct ptdump_range[]) { {pinfo->base_addr, pinfo->end}, {0, 0} @@ -347,7 +384,12 @@ bool ptdump_check_wx(void) .level = -1, .check_wx = true, .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = (struct ptdump_range[]) { {KERN_VIRT_START, ULONG_MAX}, {0, 0} diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 005497ffebda1..5345398df6534 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -97,7 +97,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) if (!table) return NULL; crst_table_init(table, _SEGMENT_ENTRY_EMPTY); - if (!pagetable_pmd_ctor(virt_to_ptdesc(table))) { + if (!pagetable_pmd_ctor(mm, virt_to_ptdesc(table))) { crst_table_free(mm, table); return NULL; } diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index f8a6b54986ecb..1c661ac62ce81 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1448,16 +1448,6 @@ static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot) return pte_mkyoung(__pte); } -static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) -{ - unsigned long physpage = page_to_phys(page); - pte_t __pte = mk_pte_phys(physpage, pgprot); - - if (pte_write(__pte) && PageDirty(page)) - __pte = pte_mkdirty(__pte); - return __pte; -} - #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) #define p4d_index(address) (((address) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) @@ -1879,7 +1869,6 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, #define pmdp_collapse_flush pmdp_collapse_flush #define pfn_pmd(pfn, pgprot) mk_pmd_phys(((pfn) << PAGE_SHIFT), (pgprot)) -#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) static inline int pmd_trans_huge(pmd_t pmd) { diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index 0213ec800b576..bd4cb00ccd5e3 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -24,6 +24,18 @@ static inline long syscall_get_nr(struct task_struct *task, (regs->int_code & 0xffff) : -1; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + /* + * Unlike syscall_get_nr(), syscall_set_nr() can be called only when + * the target task is stopped for tracing on entering syscall, so + * there is no need to have the same check syscall_get_nr() has. + */ + regs->int_code = (regs->int_code & ~0xffff) | (nr & 0xffff); +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { @@ -76,6 +88,15 @@ static inline void syscall_get_arguments(struct task_struct *task, args[0] = regs->orig_gpr2 & mask; } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + regs->orig_gpr2 = args[0]; + for (int n = 1; n < 6; n++) + regs->gprs[2 + n] = args[n]; +} + static inline int syscall_get_arch(struct task_struct *task) { #ifdef CONFIG_COMPAT diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index d3e943752fa0a..ac604b1766609 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -147,11 +147,48 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } } +static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte) +{ + note_page(pt_st, addr, 4, pte_val(pte)); +} + +static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd) +{ + note_page(pt_st, addr, 3, pmd_val(pmd)); +} + +static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud) +{ + note_page(pt_st, addr, 2, pud_val(pud)); +} + +static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d) +{ + note_page(pt_st, addr, 1, p4d_val(p4d)); +} + +static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd) +{ + note_page(pt_st, addr, 0, pgd_val(pgd)); +} + +static void note_page_flush(struct ptdump_state *pt_st) +{ + pte_t pte_zero = {0}; + + note_page(pt_st, 0, -1, pte_val(pte_zero)); +} + bool ptdump_check_wx(void) { struct pg_state st = { .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = (struct ptdump_range[]) { {.start = 0, .end = max_addr}, {.start = 0, .end = 0}, @@ -190,7 +227,12 @@ static int ptdump_show(struct seq_file *m, void *v) { struct pg_state st = { .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = (struct ptdump_range[]) { {.start = 0, .end = max_addr}, {.start = 0, .end = 0}, diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index e3a6f8ae156cd..619d6917e3b74 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -145,7 +145,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) ptdesc = pagetable_alloc(GFP_KERNEL, 0); if (!ptdesc) return NULL; - if (!pagetable_pte_ctor(ptdesc)) { + if (!pagetable_pte_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h index f939f1215232c..db2e48366e0d5 100644 --- a/arch/sh/include/asm/pgtable_32.h +++ b/arch/sh/include/asm/pgtable_32.h @@ -380,14 +380,6 @@ PTE_BIT_FUNC(low, mkspecial, |= _PAGE_SPECIAL); #define pgprot_noncached pgprot_writecombine -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - * - * extern pte_t mk_pte(struct page *page, pgprot_t pgprot) - */ -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) - static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pte.pte_low &= _PAGE_CHG_MASK; diff --git a/arch/sh/include/asm/syscall_32.h b/arch/sh/include/asm/syscall_32.h index d87738eebe30c..7027d87d901df 100644 --- a/arch/sh/include/asm/syscall_32.h +++ b/arch/sh/include/asm/syscall_32.h @@ -15,6 +15,18 @@ static inline long syscall_get_nr(struct task_struct *task, return (regs->tra >= 0) ? regs->regs[3] : -1L; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + /* + * Unlike syscall_get_nr(), syscall_set_nr() can be called only when + * the target task is stopped for tracing on entering syscall, so + * there is no need to have the same check syscall_get_nr() has. + */ + regs->regs[3] = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { @@ -57,6 +69,18 @@ static inline void syscall_get_arguments(struct task_struct *task, args[0] = regs->regs[4]; } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + regs->regs[1] = args[5]; + regs->regs[0] = args[4]; + regs->regs[7] = args[3]; + regs->regs[6] = args[2]; + regs->regs[5] = args[1]; + regs->regs[4] = args[0]; +} + static inline int syscall_get_arch(struct task_struct *task) { int arch = AUDIT_ARCH_SH; diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 62bcafe38b1f6..1454ebe915393 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -255,7 +255,11 @@ static inline pte_t pte_mkyoung(pte_t pte) } #define PFN_PTE_SHIFT (PAGE_SHIFT - 4) -#define pfn_pte(pfn, prot) mk_pte(pfn_to_page(pfn), prot) + +static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) +{ + return __pte((pfn << PFN_PTE_SHIFT) | pgprot_val(pgprot)); +} static inline unsigned long pte_pfn(pte_t pte) { @@ -272,15 +276,6 @@ static inline unsigned long pte_pfn(pte_t pte) #define pte_page(pte) pfn_to_page(pte_pfn(pte)) -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ -static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) -{ - return __pte((page_to_pfn(page) << (PAGE_SHIFT-4)) | pgprot_val(pgprot)); -} - static inline pte_t mk_pte_phys(unsigned long page, pgprot_t pgprot) { return __pte(((page) >> 4) | pgprot_val(pgprot)); diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index dc28f2c4eee3f..4af03e3c161b4 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -225,7 +225,6 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) BUILD_BUG_ON(_PAGE_SZBITS_4U != 0UL || _PAGE_SZBITS_4V != 0UL); return __pte(paddr | pgprot_val(prot)); } -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) @@ -234,7 +233,6 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) return __pmd(pte_val(pte)); } -#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) #endif /* This one can be done with two shifts. */ diff --git a/arch/sparc/include/asm/syscall.h b/arch/sparc/include/asm/syscall.h index 20c109ac8cc99..b0233924d3232 100644 --- a/arch/sparc/include/asm/syscall.h +++ b/arch/sparc/include/asm/syscall.h @@ -25,6 +25,18 @@ static inline long syscall_get_nr(struct task_struct *task, return (syscall_p ? regs->u_regs[UREG_G1] : -1L); } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + /* + * Unlike syscall_get_nr(), syscall_set_nr() can be called only when + * the target task is stopped for tracing on entering syscall, so + * there is no need to have the same check syscall_get_nr() has. + */ + regs->u_regs[UREG_G1] = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { @@ -117,6 +129,16 @@ static inline void syscall_get_arguments(struct task_struct *task, } } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + unsigned int i; + + for (i = 0; i < 6; i++) + regs->u_regs[UREG_I0 + i] = args[i]; +} + static inline int syscall_get_arch(struct task_struct *task) { #if defined(CONFIG_SPARC64) && defined(CONFIG_COMPAT) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 760818950464c..25ae4c897aae7 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2878,33 +2878,27 @@ void __flush_tlb_all(void) : : "r" (pstate)); } -pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); - pte_t *pte = NULL; - - if (page) - pte = (pte_t *) page_address(page); - - return pte; -} - -pgtable_t pte_alloc_one(struct mm_struct *mm) +static pte_t *__pte_alloc_one(struct mm_struct *mm) { struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0); if (!ptdesc) return NULL; - if (!pagetable_pte_ctor(ptdesc)) { + if (!pagetable_pte_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } return ptdesc_address(ptdesc); } -void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - free_page((unsigned long)pte); + return __pte_alloc_one(mm); +} + +pgtable_t pte_alloc_one(struct mm_struct *mm) +{ + return __pte_alloc_one(mm); } static void __pte_free(pgtable_t pte) @@ -2915,6 +2909,11 @@ static void __pte_free(pgtable_t pte) pagetable_free(ptdesc); } +void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + __pte_free(pte); +} + void pte_free(struct mm_struct *mm, pgtable_t pte) { __pte_free(pte); diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index dd32711022f55..f8fb4911d360b 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -350,7 +350,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT); spin_lock(&mm->page_table_lock); if (page_ref_inc_return(page) == 2 && - !pagetable_pte_ctor(page_ptdesc(page))) { + !pagetable_pte_ctor(mm, page_ptdesc(page))) { page_ref_dec(page); ptep = NULL; } diff --git a/arch/um/include/asm/pgtable-2level.h b/arch/um/include/asm/pgtable-2level.h index ab0c8dd865648..14ec16f92ce40 100644 --- a/arch/um/include/asm/pgtable-2level.h +++ b/arch/um/include/asm/pgtable-2level.h @@ -37,7 +37,6 @@ static inline void pgd_mkuptodate(pgd_t pgd) { } #define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) #define pte_pfn(x) phys_to_pfn(pte_val(x)) -#define pfn_pte(pfn, prot) __pte(pfn_to_phys(pfn) | pgprot_val(prot)) #define pfn_pmd(pfn, prot) __pmd(pfn_to_phys(pfn) | pgprot_val(prot)) #endif diff --git a/arch/um/include/asm/pgtable-4level.h b/arch/um/include/asm/pgtable-4level.h index 0d279caee93cc..7a271b7b83d2b 100644 --- a/arch/um/include/asm/pgtable-4level.h +++ b/arch/um/include/asm/pgtable-4level.h @@ -102,15 +102,6 @@ static inline unsigned long pte_pfn(pte_t pte) return phys_to_pfn(pte_val(pte)); } -static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) -{ - pte_t pte; - phys_t phys = pfn_to_phys(page_nr); - - pte_set_val(pte, phys, pgprot); - return pte; -} - static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { return __pmd((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index 5601ca98e8a6a..ca2a519d53ab1 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -260,19 +260,17 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b) return !((pte_val(pte_a) ^ pte_val(pte_b)) & ~_PAGE_NEEDSYNC); } -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ - #define __virt_to_page(virt) phys_to_page(__pa(virt)) #define virt_to_page(addr) __virt_to_page((const unsigned long) addr) -#define mk_pte(page, pgprot) \ - ({ pte_t pte; \ - \ - pte_set_val(pte, page_to_phys(page), (pgprot)); \ - pte;}) +static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) +{ + pte_t pte; + + pte_set_val(pte, pfn_to_phys(pfn), pgprot); + + return pte; +} static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { diff --git a/arch/um/include/asm/syscall-generic.h b/arch/um/include/asm/syscall-generic.h index 172b74143c4bf..bcd73bcfe5771 100644 --- a/arch/um/include/asm/syscall-generic.h +++ b/arch/um/include/asm/syscall-generic.h @@ -21,6 +21,11 @@ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) return PT_REGS_SYSCALL_NR(regs); } +static inline void syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr) +{ + PT_REGS_SYSCALL_NR(regs) = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { @@ -62,6 +67,20 @@ static inline void syscall_get_arguments(struct task_struct *task, *args = UPT_SYSCALL_ARG6(r); } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + struct uml_pt_regs *r = ®s->regs; + + UPT_SYSCALL_ARG1(r) = *args++; + UPT_SYSCALL_ARG2(r) = *args++; + UPT_SYSCALL_ARG3(r) = *args++; + UPT_SYSCALL_ARG4(r) = *args++; + UPT_SYSCALL_ARG5(r) = *args++; + UPT_SYSCALL_ARG6(r) = *args; +} + /* See arch/x86/um/asm/syscall.h for syscall_get_arch() definition. */ #endif /* __UM_SYSCALL_GENERIC_H */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4b9f378e05f6b..ae82a42ff70e7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2029,6 +2029,9 @@ config ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG config ARCH_SUPPORTS_KEXEC_JUMP def_bool y +config ARCH_SUPPORTS_KEXEC_HANDOVER + def_bool y + config ARCH_SUPPORTS_CRASH_DUMP def_bool X86_64 || (X86_32 && HIGHMEM) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index f03d59ea6e40f..ff11688810162 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -760,6 +760,55 @@ static void process_e820_entries(unsigned long minimum, } } +/* + * If KHO is active, only process its scratch areas to ensure we are not + * stepping onto preserved memory. + */ +#ifdef CONFIG_KEXEC_HANDOVER +static bool process_kho_entries(unsigned long minimum, unsigned long image_size) +{ + struct kho_scratch *kho_scratch; + struct setup_data *ptr; + int i, nr_areas = 0; + + ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data; + while (ptr) { + if (ptr->type == SETUP_KEXEC_KHO) { + struct kho_data *kho = (struct kho_data *)ptr->data; + + kho_scratch = (void *)kho->scratch_addr; + nr_areas = kho->scratch_size / sizeof(*kho_scratch); + + break; + } + + ptr = (struct setup_data *)(unsigned long)ptr->next; + } + + if (!nr_areas) + return false; + + for (i = 0; i < nr_areas; i++) { + struct kho_scratch *area = &kho_scratch[i]; + struct mem_vector region = { + .start = area->addr, + .size = area->size, + }; + + if (process_mem_region(®ion, minimum, image_size)) + break; + } + + return true; +} +#else +static inline bool process_kho_entries(unsigned long minimum, + unsigned long image_size) +{ + return false; +} +#endif + static unsigned long find_random_phys_addr(unsigned long minimum, unsigned long image_size) { @@ -775,7 +824,8 @@ static unsigned long find_random_phys_addr(unsigned long minimum, return 0; } - if (!process_efi_entries(minimum, image_size)) + if (!process_kho_entries(minimum, image_size) && + !process_efi_entries(minimum, image_size)) process_e820_entries(minimum, image_size); phys_addr = slots_fetch_random(); diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index cfb5ca41e30de..9fd1291e7bdfb 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -345,6 +345,7 @@ 333 common io_pgetevents sys_io_pgetevents 334 common rseq sys_rseq 335 common uretprobe sys_uretprobe +336 common uprobe sys_uprobe # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal sys_pidfd_send_signal diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 49c26ce2b1152..d328de166481e 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -21,6 +21,7 @@ #define NUM_COUNTERS_NB 4 #define NUM_COUNTERS_L2 4 #define NUM_COUNTERS_L3 6 +#define NUM_COUNTERS_MAX 64 #define RDPMC_BASE_NB 6 #define RDPMC_BASE_LLC 10 @@ -38,7 +39,10 @@ struct amd_uncore_ctx { int refcnt; int cpu; struct perf_event **events; - struct hlist_node node; + unsigned long active_mask[BITS_TO_LONGS(NUM_COUNTERS_MAX)]; + int nr_active; + struct hrtimer hrtimer; + u64 hrtimer_duration; }; struct amd_uncore_pmu { @@ -83,11 +87,51 @@ struct amd_uncore { static struct amd_uncore uncores[UNCORE_TYPE_MAX]; +/* Interval for hrtimer, defaults to 60000 milliseconds */ +static unsigned int update_interval = 60 * MSEC_PER_SEC; +module_param(update_interval, uint, 0444); + static struct amd_uncore_pmu *event_to_amd_uncore_pmu(struct perf_event *event) { return container_of(event->pmu, struct amd_uncore_pmu, pmu); } +static enum hrtimer_restart amd_uncore_hrtimer(struct hrtimer *hrtimer) +{ + struct amd_uncore_ctx *ctx; + struct perf_event *event; + int bit; + + ctx = container_of(hrtimer, struct amd_uncore_ctx, hrtimer); + + if (!ctx->nr_active || ctx->cpu != smp_processor_id()) + return HRTIMER_NORESTART; + + for_each_set_bit(bit, ctx->active_mask, NUM_COUNTERS_MAX) { + event = ctx->events[bit]; + event->pmu->read(event); + } + + hrtimer_forward_now(hrtimer, ns_to_ktime(ctx->hrtimer_duration)); + return HRTIMER_RESTART; +} + +static void amd_uncore_start_hrtimer(struct amd_uncore_ctx *ctx) +{ + hrtimer_start(&ctx->hrtimer, ns_to_ktime(ctx->hrtimer_duration), + HRTIMER_MODE_REL_PINNED_HARD); +} + +static void amd_uncore_cancel_hrtimer(struct amd_uncore_ctx *ctx) +{ + hrtimer_cancel(&ctx->hrtimer); +} + +static void amd_uncore_init_hrtimer(struct amd_uncore_ctx *ctx) +{ + hrtimer_setup(&ctx->hrtimer, amd_uncore_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); +} + static void amd_uncore_read(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -118,18 +162,26 @@ static void amd_uncore_read(struct perf_event *event) static void amd_uncore_start(struct perf_event *event, int flags) { + struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event); + struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu); struct hw_perf_event *hwc = &event->hw; + if (!ctx->nr_active++) + amd_uncore_start_hrtimer(ctx); + if (flags & PERF_EF_RELOAD) wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count)); hwc->state = 0; + __set_bit(hwc->idx, ctx->active_mask); wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE)); perf_event_update_userpage(event); } static void amd_uncore_stop(struct perf_event *event, int flags) { + struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event); + struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu); struct hw_perf_event *hwc = &event->hw; wrmsrl(hwc->config_base, hwc->config); @@ -139,6 +191,11 @@ static void amd_uncore_stop(struct perf_event *event, int flags) event->pmu->read(event); hwc->state |= PERF_HES_UPTODATE; } + + if (!--ctx->nr_active) + amd_uncore_cancel_hrtimer(ctx); + + __clear_bit(hwc->idx, ctx->active_mask); } static int amd_uncore_add(struct perf_event *event, int flags) @@ -491,6 +548,9 @@ static int amd_uncore_ctx_init(struct amd_uncore *uncore, unsigned int cpu) goto fail; } + amd_uncore_init_hrtimer(curr); + curr->hrtimer_duration = (u64)update_interval * NSEC_PER_MSEC; + cpumask_set_cpu(cpu, &pmu->active_mask); } @@ -880,16 +940,55 @@ static int amd_uncore_umc_event_init(struct perf_event *event) static void amd_uncore_umc_start(struct perf_event *event, int flags) { + struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event); + struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu); struct hw_perf_event *hwc = &event->hw; + if (!ctx->nr_active++) + amd_uncore_start_hrtimer(ctx); + if (flags & PERF_EF_RELOAD) wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count)); hwc->state = 0; + __set_bit(hwc->idx, ctx->active_mask); wrmsrl(hwc->config_base, (hwc->config | AMD64_PERFMON_V2_ENABLE_UMC)); perf_event_update_userpage(event); } +static void amd_uncore_umc_read(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev, new, shift; + s64 delta; + + shift = COUNTER_SHIFT + 1; + prev = local64_read(&hwc->prev_count); + + /* + * UMC counters do not have RDPMC assignments. Read counts directly + * from the corresponding PERF_CTR. + */ + rdmsrl(hwc->event_base, new); + + /* + * Unlike the other uncore counters, UMC counters saturate and set the + * Overflow bit (bit 48) on overflow. Since they do not roll over, + * proactively reset the corresponding PERF_CTR when bit 47 is set so + * that the counter never gets a chance to saturate. + */ + if (new & BIT_ULL(63 - COUNTER_SHIFT)) { + wrmsrl(hwc->event_base, 0); + local64_set(&hwc->prev_count, 0); + } else { + local64_set(&hwc->prev_count, new); + } + + delta = (new << shift) - (prev << shift); + delta >>= shift; + local64_add(delta, &event->count); +} + static void amd_uncore_umc_ctx_scan(struct amd_uncore *uncore, unsigned int cpu) { @@ -968,7 +1067,7 @@ int amd_uncore_umc_ctx_init(struct amd_uncore *uncore, unsigned int cpu) .del = amd_uncore_del, .start = amd_uncore_umc_start, .stop = amd_uncore_stop, - .read = amd_uncore_read, + .read = amd_uncore_umc_read, .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, .module = THIS_MODULE, }; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6866cc5acb0b5..995df8f392b67 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -95,6 +95,11 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter); DEFINE_STATIC_CALL_NULL(x86_pmu_late_setup, *x86_pmu.late_setup); +DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_enable, *x86_pmu.pebs_enable); +DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_disable, *x86_pmu.pebs_disable); +DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_enable_all, *x86_pmu.pebs_enable_all); +DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_disable_all, *x86_pmu.pebs_disable_all); + /* * This one is magic, it will get called even when PMU init fails (because * there is no PMU), in which case it should simply return NULL. @@ -674,6 +679,7 @@ static int __x86_pmu_event_init(struct perf_event *event) event->hw.idx = -1; event->hw.last_cpu = -1; event->hw.last_tag = ~0ULL; + event->hw.dyn_constraint = ~0ULL; /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; @@ -754,7 +760,7 @@ void x86_pmu_enable_all(int added) } } -static inline int is_x86_event(struct perf_event *event) +int is_x86_event(struct perf_event *event) { int i; @@ -1683,6 +1689,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) struct cpu_hw_events *cpuc; struct perf_event *event; int idx, handled = 0; + u64 last_period; u64 val; cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1702,6 +1709,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) continue; event = cpuc->events[idx]; + last_period = event->hw.last_period; val = static_call(x86_pmu_update)(event); if (val & (1ULL << (x86_pmu.cntval_bits - 1))) @@ -1715,7 +1723,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) if (!static_call(x86_pmu_set_period)(event)) continue; - perf_sample_data_init(&data, 0, event->hw.last_period); + perf_sample_data_init(&data, 0, last_period); perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); @@ -2046,6 +2054,11 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_filter, x86_pmu.filter); static_call_update(x86_pmu_late_setup, x86_pmu.late_setup); + + static_call_update(x86_pmu_pebs_enable, x86_pmu.pebs_enable); + static_call_update(x86_pmu_pebs_disable, x86_pmu.pebs_disable); + static_call_update(x86_pmu_pebs_enable_all, x86_pmu.pebs_enable_all); + static_call_update(x86_pmu_pebs_disable_all, x86_pmu.pebs_disable_all); } static void _x86_pmu_read(struct perf_event *event) diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index a95e6c91c4d7c..9560f693fac07 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -80,54 +80,54 @@ static void * bts_buffer_setup_aux(struct perf_event *event, void **pages, int nr_pages, bool overwrite) { - struct bts_buffer *buf; + struct bts_buffer *bb; struct page *page; int cpu = event->cpu; int node = (cpu == -1) ? cpu : cpu_to_node(cpu); unsigned long offset; size_t size = nr_pages << PAGE_SHIFT; - int pg, nbuf, pad; + int pg, nr_buf, pad; /* count all the high order buffers */ - for (pg = 0, nbuf = 0; pg < nr_pages;) { + for (pg = 0, nr_buf = 0; pg < nr_pages;) { page = virt_to_page(pages[pg]); pg += buf_nr_pages(page); - nbuf++; + nr_buf++; } /* * to avoid interrupts in overwrite mode, only allow one physical */ - if (overwrite && nbuf > 1) + if (overwrite && nr_buf > 1) return NULL; - buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node); - if (!buf) + bb = kzalloc_node(struct_size(bb, buf, nr_buf), GFP_KERNEL, node); + if (!bb) return NULL; - buf->nr_pages = nr_pages; - buf->nr_bufs = nbuf; - buf->snapshot = overwrite; - buf->data_pages = pages; - buf->real_size = size - size % BTS_RECORD_SIZE; + bb->nr_pages = nr_pages; + bb->nr_bufs = nr_buf; + bb->snapshot = overwrite; + bb->data_pages = pages; + bb->real_size = size - size % BTS_RECORD_SIZE; - for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) { + for (pg = 0, nr_buf = 0, offset = 0, pad = 0; nr_buf < bb->nr_bufs; nr_buf++) { unsigned int __nr_pages; page = virt_to_page(pages[pg]); __nr_pages = buf_nr_pages(page); - buf->buf[nbuf].page = page; - buf->buf[nbuf].offset = offset; - buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0); - buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement; - pad = buf->buf[nbuf].size % BTS_RECORD_SIZE; - buf->buf[nbuf].size -= pad; + bb->buf[nr_buf].page = page; + bb->buf[nr_buf].offset = offset; + bb->buf[nr_buf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0); + bb->buf[nr_buf].size = buf_size(page) - bb->buf[nr_buf].displacement; + pad = bb->buf[nr_buf].size % BTS_RECORD_SIZE; + bb->buf[nr_buf].size -= pad; pg += __nr_pages; offset += __nr_pages << PAGE_SHIFT; } - return buf; + return bb; } static void bts_buffer_free_aux(void *data) @@ -135,25 +135,25 @@ static void bts_buffer_free_aux(void *data) kfree(data); } -static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx) +static unsigned long bts_buffer_offset(struct bts_buffer *bb, unsigned int idx) { - return buf->buf[idx].offset + buf->buf[idx].displacement; + return bb->buf[idx].offset + bb->buf[idx].displacement; } static void -bts_config_buffer(struct bts_buffer *buf) +bts_config_buffer(struct bts_buffer *bb) { int cpu = raw_smp_processor_id(); struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - struct bts_phys *phys = &buf->buf[buf->cur_buf]; + struct bts_phys *phys = &bb->buf[bb->cur_buf]; unsigned long index, thresh = 0, end = phys->size; struct page *page = phys->page; - index = local_read(&buf->head); + index = local_read(&bb->head); - if (!buf->snapshot) { - if (buf->end < phys->offset + buf_size(page)) - end = buf->end - phys->offset - phys->displacement; + if (!bb->snapshot) { + if (bb->end < phys->offset + buf_size(page)) + end = bb->end - phys->offset - phys->displacement; index -= phys->offset + phys->displacement; @@ -168,7 +168,7 @@ bts_config_buffer(struct bts_buffer *buf) ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement; ds->bts_index = ds->bts_buffer_base + index; ds->bts_absolute_maximum = ds->bts_buffer_base + end; - ds->bts_interrupt_threshold = !buf->snapshot + ds->bts_interrupt_threshold = !bb->snapshot ? ds->bts_buffer_base + thresh : ds->bts_absolute_maximum + BTS_RECORD_SIZE; } @@ -184,16 +184,16 @@ static void bts_update(struct bts_ctx *bts) { int cpu = raw_smp_processor_id(); struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - struct bts_buffer *buf = perf_get_aux(&bts->handle); + struct bts_buffer *bb = perf_get_aux(&bts->handle); unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head; - if (!buf) + if (!bb) return; - head = index + bts_buffer_offset(buf, buf->cur_buf); - old = local_xchg(&buf->head, head); + head = index + bts_buffer_offset(bb, bb->cur_buf); + old = local_xchg(&bb->head, head); - if (!buf->snapshot) { + if (!bb->snapshot) { if (old == head) return; @@ -205,9 +205,9 @@ static void bts_update(struct bts_ctx *bts) * old and head are always in the same physical buffer, so we * can subtract them to get the data size. */ - local_add(head - old, &buf->data_size); + local_add(head - old, &bb->data_size); } else { - local_set(&buf->data_size, head); + local_set(&bb->data_size, head); } /* @@ -218,7 +218,7 @@ static void bts_update(struct bts_ctx *bts) } static int -bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle); +bts_buffer_reset(struct bts_buffer *bb, struct perf_output_handle *handle); /* * Ordering PMU callbacks wrt themselves and the PMI is done by means @@ -232,17 +232,17 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle); static void __bts_event_start(struct perf_event *event) { struct bts_ctx *bts = this_cpu_ptr(bts_ctx); - struct bts_buffer *buf = perf_get_aux(&bts->handle); + struct bts_buffer *bb = perf_get_aux(&bts->handle); u64 config = 0; - if (!buf->snapshot) + if (!bb->snapshot) config |= ARCH_PERFMON_EVENTSEL_INT; if (!event->attr.exclude_kernel) config |= ARCH_PERFMON_EVENTSEL_OS; if (!event->attr.exclude_user) config |= ARCH_PERFMON_EVENTSEL_USR; - bts_config_buffer(buf); + bts_config_buffer(bb); /* * local barrier to make sure that ds configuration made it @@ -261,13 +261,13 @@ static void bts_event_start(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct bts_ctx *bts = this_cpu_ptr(bts_ctx); - struct bts_buffer *buf; + struct bts_buffer *bb; - buf = perf_aux_output_begin(&bts->handle, event); - if (!buf) + bb = perf_aux_output_begin(&bts->handle, event); + if (!bb) goto fail_stop; - if (bts_buffer_reset(buf, &bts->handle)) + if (bts_buffer_reset(bb, &bts->handle)) goto fail_end_stop; bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; @@ -306,27 +306,27 @@ static void bts_event_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct bts_ctx *bts = this_cpu_ptr(bts_ctx); - struct bts_buffer *buf = NULL; + struct bts_buffer *bb = NULL; int state = READ_ONCE(bts->state); if (state == BTS_STATE_ACTIVE) __bts_event_stop(event, BTS_STATE_STOPPED); if (state != BTS_STATE_STOPPED) - buf = perf_get_aux(&bts->handle); + bb = perf_get_aux(&bts->handle); event->hw.state |= PERF_HES_STOPPED; if (flags & PERF_EF_UPDATE) { bts_update(bts); - if (buf) { - if (buf->snapshot) + if (bb) { + if (bb->snapshot) bts->handle.head = - local_xchg(&buf->data_size, - buf->nr_pages << PAGE_SHIFT); + local_xchg(&bb->data_size, + bb->nr_pages << PAGE_SHIFT); perf_aux_output_end(&bts->handle, - local_xchg(&buf->data_size, 0)); + local_xchg(&bb->data_size, 0)); } cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; @@ -382,19 +382,19 @@ void intel_bts_disable_local(void) } static int -bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) +bts_buffer_reset(struct bts_buffer *bb, struct perf_output_handle *handle) { unsigned long head, space, next_space, pad, gap, skip, wakeup; unsigned int next_buf; struct bts_phys *phys, *next_phys; int ret; - if (buf->snapshot) + if (bb->snapshot) return 0; - head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1); + head = handle->head & ((bb->nr_pages << PAGE_SHIFT) - 1); - phys = &buf->buf[buf->cur_buf]; + phys = &bb->buf[bb->cur_buf]; space = phys->offset + phys->displacement + phys->size - head; pad = space; if (space > handle->size) { @@ -403,10 +403,10 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) } if (space <= BTS_SAFETY_MARGIN) { /* See if next phys buffer has more space */ - next_buf = buf->cur_buf + 1; - if (next_buf >= buf->nr_bufs) + next_buf = bb->cur_buf + 1; + if (next_buf >= bb->nr_bufs) next_buf = 0; - next_phys = &buf->buf[next_buf]; + next_phys = &bb->buf[next_buf]; gap = buf_size(phys->page) - phys->displacement - phys->size + next_phys->displacement; skip = pad + gap; @@ -431,8 +431,8 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) * anymore, so we must not be racing with * bts_update(). */ - buf->cur_buf = next_buf; - local_set(&buf->head, head); + bb->cur_buf = next_buf; + local_set(&bb->head, head); } } } @@ -445,7 +445,7 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) space -= space % BTS_RECORD_SIZE; } - buf->end = head + space; + bb->end = head + space; /* * If we have no space, the lost notification would have been sent when @@ -462,7 +462,7 @@ int intel_bts_interrupt(void) struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds; struct bts_ctx *bts; struct perf_event *event; - struct bts_buffer *buf; + struct bts_buffer *bb; s64 old_head; int err = -ENOSPC, handled = 0; @@ -485,8 +485,8 @@ int intel_bts_interrupt(void) if (READ_ONCE(bts->state) == BTS_STATE_STOPPED) return handled; - buf = perf_get_aux(&bts->handle); - if (!buf) + bb = perf_get_aux(&bts->handle); + if (!bb) return handled; /* @@ -494,26 +494,26 @@ int intel_bts_interrupt(void) * there's no other way of telling, because the pointer will * keep moving */ - if (buf->snapshot) + if (bb->snapshot) return 0; - old_head = local_read(&buf->head); + old_head = local_read(&bb->head); bts_update(bts); /* no new data */ - if (old_head == local_read(&buf->head)) + if (old_head == local_read(&bb->head)) return handled; - perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0)); + perf_aux_output_end(&bts->handle, local_xchg(&bb->data_size, 0)); - buf = perf_aux_output_begin(&bts->handle, event); - if (buf) - err = bts_buffer_reset(buf, &bts->handle); + bb = perf_aux_output_begin(&bts->handle, event); + if (bb) + err = bts_buffer_reset(bb, &bts->handle); if (err) { WRITE_ONCE(bts->state, BTS_STATE_STOPPED); - if (buf) { + if (bb) { /* * BTS_STATE_STOPPED should be visible before * cleared handle::event @@ -599,7 +599,11 @@ static void bts_event_read(struct perf_event *event) static __init int bts_init(void) { - if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) + if (!boot_cpu_has(X86_FEATURE_DTES64)) + return -ENODEV; + + x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); + if (!x86_pmu.bts) return -ENODEV; if (boot_cpu_has(X86_FEATURE_PTI)) { diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 00dfe487bd00e..cd63292073113 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2224,6 +2224,18 @@ static struct extra_reg intel_cmt_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound_skt, "event=0x9c,umask=0x01"); +EVENT_ATTR_STR(topdown-retiring, td_retiring_skt, "event=0xc2,umask=0x02"); +EVENT_ATTR_STR(topdown-be-bound, td_be_bound_skt, "event=0xa4,umask=0x02"); + +static struct attribute *skt_events_attrs[] = { + EVENT_PTR(td_fe_bound_skt), + EVENT_PTR(td_retiring_skt), + EVENT_PTR(td_bad_spec_cmt), + EVENT_PTR(td_be_bound_skt), + NULL, +}; + #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ #define KNL_MCDRAM_LOCAL BIT_ULL(21) @@ -2294,7 +2306,7 @@ static __always_inline void __intel_pmu_disable_all(bool bts) static __always_inline void intel_pmu_disable_all(void) { __intel_pmu_disable_all(true); - intel_pmu_pebs_disable_all(); + static_call_cond(x86_pmu_pebs_disable_all)(); intel_pmu_lbr_disable_all(); } @@ -2326,7 +2338,7 @@ static void __intel_pmu_enable_all(int added, bool pmi) static void intel_pmu_enable_all(int added) { - intel_pmu_pebs_enable_all(); + static_call_cond(x86_pmu_pebs_enable_all)(); __intel_pmu_enable_all(added, false); } @@ -2583,7 +2595,7 @@ static void intel_pmu_disable_event(struct perf_event *event) * so we don't trigger the event without PEBS bit set. */ if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_disable(event); + static_call(x86_pmu_pebs_disable)(event); } static void intel_pmu_assign_event(struct perf_event *event, int idx) @@ -2603,6 +2615,9 @@ static void intel_pmu_del_event(struct perf_event *event) intel_pmu_lbr_del(event); if (event->attr.precise_ip) intel_pmu_pebs_del(event); + if (is_pebs_counter_event_group(event) || + is_acr_event_group(event)) + this_cpu_ptr(&cpu_hw_events)->n_late_setup--; } static int icl_set_topdown_event_period(struct perf_event *event) @@ -2880,6 +2895,52 @@ static void intel_pmu_enable_fixed(struct perf_event *event) cpuc->fixed_ctrl_val |= bits; } +static void intel_pmu_config_acr(int idx, u64 mask, u32 reload) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int msr_b, msr_c; + + if (!mask && !cpuc->acr_cfg_b[idx]) + return; + + if (idx < INTEL_PMC_IDX_FIXED) { + msr_b = MSR_IA32_PMC_V6_GP0_CFG_B; + msr_c = MSR_IA32_PMC_V6_GP0_CFG_C; + } else { + msr_b = MSR_IA32_PMC_V6_FX0_CFG_B; + msr_c = MSR_IA32_PMC_V6_FX0_CFG_C; + idx -= INTEL_PMC_IDX_FIXED; + } + + if (cpuc->acr_cfg_b[idx] != mask) { + wrmsrl(msr_b + x86_pmu.addr_offset(idx, false), mask); + cpuc->acr_cfg_b[idx] = mask; + } + /* Only need to update the reload value when there is a valid config value. */ + if (mask && cpuc->acr_cfg_c[idx] != reload) { + wrmsrl(msr_c + x86_pmu.addr_offset(idx, false), reload); + cpuc->acr_cfg_c[idx] = reload; + } +} + +static void intel_pmu_enable_acr(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!is_acr_event_group(event) || !event->attr.config2) { + /* + * The disable doesn't clear the ACR CFG register. + * Check and clear the ACR CFG register. + */ + intel_pmu_config_acr(hwc->idx, 0, 0); + return; + } + + intel_pmu_config_acr(hwc->idx, hwc->config1, -hwc->sample_period); +} + +DEFINE_STATIC_CALL_NULL(intel_pmu_enable_acr_event, intel_pmu_enable_acr); + static void intel_pmu_enable_event(struct perf_event *event) { u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE; @@ -2887,16 +2948,19 @@ static void intel_pmu_enable_event(struct perf_event *event) int idx = hwc->idx; if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_enable(event); + static_call(x86_pmu_pebs_enable)(event); switch (idx) { case 0 ... INTEL_PMC_IDX_FIXED - 1: if (branch_sample_counters(event)) enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR; intel_set_masks(event, idx); + static_call_cond(intel_pmu_enable_acr_event)(event); __x86_pmu_enable_event(hwc, enable_mask); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: + static_call_cond(intel_pmu_enable_acr_event)(event); + fallthrough; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: intel_pmu_enable_fixed(event); break; @@ -2914,12 +2978,51 @@ static void intel_pmu_enable_event(struct perf_event *event) } } +static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc) +{ + struct perf_event *event, *leader; + int i, j, idx; + + for (i = 0; i < cpuc->n_events; i++) { + leader = cpuc->event_list[i]; + if (!is_acr_event_group(leader)) + continue; + + /* The ACR events must be contiguous. */ + for (j = i; j < cpuc->n_events; j++) { + event = cpuc->event_list[j]; + if (event->group_leader != leader->group_leader) + break; + for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { + if (WARN_ON_ONCE(i + idx > cpuc->n_events)) + return; + __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1); + } + } + i = j - 1; + } +} + +void intel_pmu_late_setup(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!cpuc->n_late_setup) + return; + + intel_pmu_pebs_late_setup(cpuc); + intel_pmu_acr_late_setup(cpuc); +} + static void intel_pmu_add_event(struct perf_event *event) { if (event->attr.precise_ip) intel_pmu_pebs_add(event); if (intel_pmu_needs_branch_stack(event)) intel_pmu_lbr_add(event); + if (is_pebs_counter_event_group(event) || + is_acr_event_group(event)) + this_cpu_ptr(&cpu_hw_events)->n_late_setup++; } /* @@ -3141,6 +3244,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; + u64 last_period; handled++; @@ -3168,10 +3272,12 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) if (is_pebs_counter_event_group(event)) x86_pmu.drain_pebs(regs, &data); + last_period = event->hw.last_period; + if (!intel_pmu_save_and_restart(event)) continue; - perf_sample_data_init(&data, 0, event->hw.last_period); + perf_sample_data_init(&data, 0, last_period); if (has_branch_stack(event)) intel_pmu_lbr_save_brstack(&data, cpuc, event); @@ -3739,10 +3845,9 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, if (cpuc->excl_cntrs) return intel_get_excl_constraints(cpuc, event, idx, c2); - /* Not all counters support the branch counter feature. */ - if (branch_sample_counters(event)) { + if (event->hw.dyn_constraint != ~0ULL) { c2 = dyn_constraint(cpuc, c2, idx); - c2->idxmsk64 &= x86_pmu.lbr_counters; + c2->idxmsk64 &= event->hw.dyn_constraint; c2->weight = hweight64(c2->idxmsk64); } @@ -4083,6 +4188,39 @@ static u64 intel_pmu_freq_start_period(struct perf_event *event) return start; } +static inline bool intel_pmu_has_acr(struct pmu *pmu) +{ + return !!hybrid(pmu, acr_cause_mask64); +} + +static bool intel_pmu_is_acr_group(struct perf_event *event) +{ + /* The group leader has the ACR flag set */ + if (is_acr_event_group(event)) + return true; + + /* The acr_mask is set */ + if (event->attr.config2) + return true; + + return false; +} + +static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event, + u64 *cause_mask, int *num) +{ + event->hw.dyn_constraint &= hybrid(event->pmu, acr_cntr_mask64); + *cause_mask |= event->attr.config2; + *num += 1; +} + +static inline void intel_pmu_set_acr_caused_constr(struct perf_event *event, + int idx, u64 cause_mask) +{ + if (test_bit(idx, (unsigned long *)&cause_mask)) + event->hw.dyn_constraint &= hybrid(event->pmu, acr_cause_mask64); +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -4144,15 +4282,19 @@ static int intel_pmu_hw_config(struct perf_event *event) leader = event->group_leader; if (branch_sample_call_stack(leader)) return -EINVAL; - if (branch_sample_counters(leader)) + if (branch_sample_counters(leader)) { num++; + leader->hw.dyn_constraint &= x86_pmu.lbr_counters; + } leader->hw.flags |= PERF_X86_EVENT_BRANCH_COUNTERS; for_each_sibling_event(sibling, leader) { if (branch_sample_call_stack(sibling)) return -EINVAL; - if (branch_sample_counters(sibling)) + if (branch_sample_counters(sibling)) { num++; + sibling->hw.dyn_constraint &= x86_pmu.lbr_counters; + } } if (num > fls(x86_pmu.lbr_counters)) @@ -4207,6 +4349,94 @@ static int intel_pmu_hw_config(struct perf_event *event) event->attr.precise_ip) event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR; + if (intel_pmu_has_acr(event->pmu) && intel_pmu_is_acr_group(event)) { + struct perf_event *sibling, *leader = event->group_leader; + struct pmu *pmu = event->pmu; + bool has_sw_event = false; + int num = 0, idx = 0; + u64 cause_mask = 0; + + /* Not support perf metrics */ + if (is_metric_event(event)) + return -EINVAL; + + /* Not support freq mode */ + if (event->attr.freq) + return -EINVAL; + + /* PDist is not supported */ + if (event->attr.config2 && event->attr.precise_ip > 2) + return -EINVAL; + + /* The reload value cannot exceeds the max period */ + if (event->attr.sample_period > x86_pmu.max_period) + return -EINVAL; + /* + * The counter-constraints of each event cannot be finalized + * unless the whole group is scanned. However, it's hard + * to know whether the event is the last one of the group. + * Recalculate the counter-constraints for each event when + * adding a new event. + * + * The group is traversed twice, which may be optimized later. + * In the first round, + * - Find all events which do reload when other events + * overflow and set the corresponding counter-constraints + * - Add all events, which can cause other events reload, + * in the cause_mask + * - Error out if the number of events exceeds the HW limit + * - The ACR events must be contiguous. + * Error out if there are non-X86 events between ACR events. + * This is not a HW limit, but a SW limit. + * With the assumption, the intel_pmu_acr_late_setup() can + * easily convert the event idx to counter idx without + * traversing the whole event list. + */ + if (!is_x86_event(leader)) + return -EINVAL; + + if (leader->attr.config2) + intel_pmu_set_acr_cntr_constr(leader, &cause_mask, &num); + + if (leader->nr_siblings) { + for_each_sibling_event(sibling, leader) { + if (!is_x86_event(sibling)) { + has_sw_event = true; + continue; + } + if (!sibling->attr.config2) + continue; + if (has_sw_event) + return -EINVAL; + intel_pmu_set_acr_cntr_constr(sibling, &cause_mask, &num); + } + } + if (leader != event && event->attr.config2) { + if (has_sw_event) + return -EINVAL; + intel_pmu_set_acr_cntr_constr(event, &cause_mask, &num); + } + + if (hweight64(cause_mask) > hweight64(hybrid(pmu, acr_cause_mask64)) || + num > hweight64(hybrid(event->pmu, acr_cntr_mask64))) + return -EINVAL; + /* + * In the second round, apply the counter-constraints for + * the events which can cause other events reload. + */ + intel_pmu_set_acr_caused_constr(leader, idx++, cause_mask); + + if (leader->nr_siblings) { + for_each_sibling_event(sibling, leader) + intel_pmu_set_acr_caused_constr(sibling, idx++, cause_mask); + } + + if (leader != event) + intel_pmu_set_acr_caused_constr(event, idx, cause_mask); + + leader->hw.flags |= PERF_X86_EVENT_ACR; + } + if ((event->attr.type == PERF_TYPE_HARDWARE) || (event->attr.type == PERF_TYPE_HW_CACHE)) return 0; @@ -4354,7 +4584,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) .guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask & ~pebs_mask, }; - if (!x86_pmu.pebs) + if (!x86_pmu.ds_pebs) return arr; /* @@ -4952,7 +5182,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) goto err; } - if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_BR_CNTR)) { + if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_DYN_CONSTRAINT)) { size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); @@ -5041,7 +5271,7 @@ static inline bool intel_pmu_broken_perf_cap(void) return false; } -static void update_pmu_cap(struct x86_hybrid_pmu *pmu) +static void update_pmu_cap(struct pmu *pmu) { unsigned int cntr, fixed_cntr, ecx, edx; union cpuid35_eax eax; @@ -5050,20 +5280,30 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu) cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx.full, &ecx, &edx); if (ebx.split.umask2) - pmu->config_mask |= ARCH_PERFMON_EVENTSEL_UMASK2; + hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_UMASK2; if (ebx.split.eq) - pmu->config_mask |= ARCH_PERFMON_EVENTSEL_EQ; + hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_EQ; if (eax.split.cntr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, &cntr, &fixed_cntr, &ecx, &edx); - pmu->cntr_mask64 = cntr; - pmu->fixed_cntr_mask64 = fixed_cntr; + hybrid(pmu, cntr_mask64) = cntr; + hybrid(pmu, fixed_cntr_mask64) = fixed_cntr; + } + + if (eax.split.acr_subleaf) { + cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_ACR_LEAF, + &cntr, &fixed_cntr, &ecx, &edx); + /* The mask of the counters which can be reloaded */ + hybrid(pmu, acr_cntr_mask64) = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED); + + /* The mask of the counters which can cause a reload of reloadable counters */ + hybrid(pmu, acr_cause_mask64) = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED); } if (!intel_pmu_broken_perf_cap()) { /* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */ - rdmsrl(MSR_IA32_PERF_CAPABILITIES, pmu->intel_cap.capabilities); + rdmsrl(MSR_IA32_PERF_CAPABILITIES, hybrid(pmu, intel_cap).capabilities); } } @@ -5150,7 +5390,7 @@ static bool init_hybrid_pmu(int cpu) goto end; if (this_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) - update_pmu_cap(pmu); + update_pmu_cap(&pmu->pmu); intel_pmu_check_hybrid_pmus(pmu); @@ -5524,7 +5764,7 @@ static __init void intel_clovertown_quirk(void) * these chips. */ pr_warn("PEBS disabled due to CPU errata\n"); - x86_pmu.pebs = 0; + x86_pmu.ds_pebs = 0; x86_pmu.pebs_constraints = NULL; } @@ -6012,7 +6252,7 @@ tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i) static umode_t pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i) { - return x86_pmu.pebs ? attr->mode : 0; + return x86_pmu.ds_pebs ? attr->mode : 0; } static umode_t @@ -6043,6 +6283,21 @@ td_is_visible(struct kobject *kobj, struct attribute *attr, int i) return attr->mode; } +PMU_FORMAT_ATTR(acr_mask, "config2:0-63"); + +static struct attribute *format_acr_attrs[] = { + &format_attr_acr_mask.attr, + NULL +}; + +static umode_t +acr_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + + return intel_pmu_has_acr(dev_get_drvdata(dev)) ? attr->mode : 0; +} + static struct attribute_group group_events_td = { .name = "events", .is_visible = td_is_visible, @@ -6085,6 +6340,12 @@ static struct attribute_group group_format_evtsel_ext = { .is_visible = evtsel_ext_is_visible, }; +static struct attribute_group group_format_acr = { + .name = "format", + .attrs = format_acr_attrs, + .is_visible = acr_is_visible, +}; + static struct attribute_group group_default = { .attrs = intel_pmu_attrs, .is_visible = default_is_visible, @@ -6099,6 +6360,7 @@ static const struct attribute_group *attr_update[] = { &group_format_extra, &group_format_extra_skl, &group_format_evtsel_ext, + &group_format_acr, &group_default, NULL, }; @@ -6383,6 +6645,7 @@ static const struct attribute_group *hybrid_attr_update[] = { &group_caps_lbr, &hybrid_group_format_extra, &group_format_evtsel_ext, + &group_format_acr, &group_default, &hybrid_group_cpus, NULL, @@ -6575,6 +6838,7 @@ static __always_inline void intel_pmu_init_skt(struct pmu *pmu) intel_pmu_init_grt(pmu); hybrid(pmu, event_constraints) = intel_skt_event_constraints; hybrid(pmu, extra_regs) = intel_cmt_extra_regs; + static_call_update(intel_pmu_enable_acr_event, intel_pmu_enable_acr); } __init int intel_pmu_init(void) @@ -6635,6 +6899,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_events_mask = intel_pmu_pebs_mask(x86_pmu.cntr_mask64); x86_pmu.pebs_capable = PEBS_COUNTER_MASK; + x86_pmu.config_mask = X86_RAW_EVENT_MASK; /* * Quirk: v2 perfmon does not report fixed-purpose events, so @@ -6663,7 +6928,7 @@ __init int intel_pmu_init(void) if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) intel_pmu_arch_lbr_init(); - intel_ds_init(); + intel_pebs_init(); x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ @@ -6673,6 +6938,12 @@ __init int intel_pmu_init(void) pr_cont(" AnyThread deprecated, "); } + /* + * Many features on and after V6 require dynamic constraint, + * e.g., Arch PEBS, ACR. + */ + if (version >= 6) + x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT; /* * Install the hw-cache-events table: */ @@ -6884,6 +7155,18 @@ __init int intel_pmu_init(void) name = "crestmont"; break; + case INTEL_ATOM_DARKMONT_X: + intel_pmu_init_skt(NULL); + intel_pmu_pebs_data_source_cmt(); + x86_pmu.pebs_latency_data = cmt_latency_data; + x86_pmu.get_event_constraints = cmt_get_event_constraints; + td_attr = skt_events_attrs; + mem_attr = grt_mem_attrs; + extra_attr = cmt_format_attr; + pr_cont("Darkmont events, "); + name = "darkmont"; + break; + case INTEL_WESTMERE: case INTEL_WESTMERE_EP: case INTEL_WESTMERE_EX: @@ -7433,6 +7716,18 @@ __init int intel_pmu_init(void) x86_pmu.attr_update = hybrid_attr_update; } + /* + * The archPerfmonExt (0x23) includes an enhanced enumeration of + * PMU architectural features with a per-core view. For non-hybrid, + * each core has the same PMU capabilities. It's good enough to + * update the x86_pmu from the booting CPU. For hybrid, the x86_pmu + * is used to keep the common capabilities. Still keep the values + * from the leaf 0xa. The core specific update will be done later + * when a new type is online. + */ + if (!is_hybrid() && boot_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) + update_pmu_cap(NULL); + intel_pmu_check_counters_mask(&x86_pmu.cntr_mask64, &x86_pmu.fixed_cntr_mask64, &x86_pmu.intel_ctrl); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 18c3ab579b8b3..e216622b94dc2 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -624,7 +624,7 @@ static int alloc_pebs_buffer(int cpu) int max, node = cpu_to_node(cpu); void *buffer, *insn_buff, *cea; - if (!x86_pmu.pebs) + if (!x86_pmu.ds_pebs) return 0; buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu); @@ -659,7 +659,7 @@ static void release_pebs_buffer(int cpu) struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); void *cea; - if (!x86_pmu.pebs) + if (!x86_pmu.ds_pebs) return; kfree(per_cpu(insn_buffer, cpu)); @@ -734,7 +734,7 @@ void release_ds_buffers(void) { int cpu; - if (!x86_pmu.bts && !x86_pmu.pebs) + if (!x86_pmu.bts && !x86_pmu.ds_pebs) return; for_each_possible_cpu(cpu) @@ -750,7 +750,8 @@ void release_ds_buffers(void) } for_each_possible_cpu(cpu) { - release_pebs_buffer(cpu); + if (x86_pmu.ds_pebs) + release_pebs_buffer(cpu); release_bts_buffer(cpu); } } @@ -761,15 +762,17 @@ void reserve_ds_buffers(void) int cpu; x86_pmu.bts_active = 0; - x86_pmu.pebs_active = 0; - if (!x86_pmu.bts && !x86_pmu.pebs) + if (x86_pmu.ds_pebs) + x86_pmu.pebs_active = 0; + + if (!x86_pmu.bts && !x86_pmu.ds_pebs) return; if (!x86_pmu.bts) bts_err = 1; - if (!x86_pmu.pebs) + if (!x86_pmu.ds_pebs) pebs_err = 1; for_each_possible_cpu(cpu) { @@ -781,7 +784,8 @@ void reserve_ds_buffers(void) if (!bts_err && alloc_bts_buffer(cpu)) bts_err = 1; - if (!pebs_err && alloc_pebs_buffer(cpu)) + if (x86_pmu.ds_pebs && !pebs_err && + alloc_pebs_buffer(cpu)) pebs_err = 1; if (bts_err && pebs_err) @@ -793,7 +797,7 @@ void reserve_ds_buffers(void) release_bts_buffer(cpu); } - if (pebs_err) { + if (x86_pmu.ds_pebs && pebs_err) { for_each_possible_cpu(cpu) release_pebs_buffer(cpu); } @@ -805,7 +809,7 @@ void reserve_ds_buffers(void) if (x86_pmu.bts && !bts_err) x86_pmu.bts_active = 1; - if (x86_pmu.pebs && !pebs_err) + if (x86_pmu.ds_pebs && !pebs_err) x86_pmu.pebs_active = 1; for_each_possible_cpu(cpu) { @@ -1355,9 +1359,8 @@ static void __intel_pmu_pebs_update_cfg(struct perf_event *event, } -static void intel_pmu_late_setup(void) +void intel_pmu_pebs_late_setup(struct cpu_hw_events *cpuc) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *event; u64 pebs_data_cfg = 0; int i; @@ -2652,10 +2655,10 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d } /* - * BTS, PEBS probe and setup + * PEBS probe and setup */ -void __init intel_ds_init(void) +void __init intel_pebs_init(void) { /* * No support for 32bit formats @@ -2663,13 +2666,12 @@ void __init intel_ds_init(void) if (!boot_cpu_has(X86_FEATURE_DTES64)) return; - x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); - x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); + x86_pmu.ds_pebs = boot_cpu_has(X86_FEATURE_PEBS); x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; if (x86_pmu.version <= 4) x86_pmu.pebs_no_isolation = 1; - if (x86_pmu.pebs) { + if (x86_pmu.ds_pebs) { char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; char *pebs_qual = ""; int format = x86_pmu.intel_cap.pebs_format; @@ -2677,6 +2679,11 @@ void __init intel_ds_init(void) if (format < 4) x86_pmu.intel_cap.pebs_baseline = 0; + x86_pmu.pebs_enable = intel_pmu_pebs_enable; + x86_pmu.pebs_disable = intel_pmu_pebs_disable; + x86_pmu.pebs_enable_all = intel_pmu_pebs_enable_all; + x86_pmu.pebs_disable_all = intel_pmu_pebs_disable_all; + switch (format) { case 0: pr_cont("PEBS fmt0%c, ", pebs_type); @@ -2761,7 +2768,7 @@ void __init intel_ds_init(void) default: pr_cont("no PEBS fmt%d%c, ", format, pebs_type); - x86_pmu.pebs = 0; + x86_pmu.ds_pebs = 0; } } } @@ -2770,7 +2777,7 @@ void perf_restore_debug_store(void) { struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); - if (!x86_pmu.bts && !x86_pmu.pebs) + if (!x86_pmu.bts && !x86_pmu.ds_pebs) return; wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds); diff --git a/arch/x86/events/intel/knc.c b/arch/x86/events/intel/knc.c index 034a1f6a457c6..3e8ec049b46d1 100644 --- a/arch/x86/events/intel/knc.c +++ b/arch/x86/events/intel/knc.c @@ -241,16 +241,18 @@ static int knc_pmu_handle_irq(struct pt_regs *regs) for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; + u64 last_period; handled++; if (!test_bit(bit, cpuc->active_mask)) continue; + last_period = event->hw.last_period; if (!intel_pmu_save_and_restart(event)) continue; - perf_sample_data_init(&data, 0, event->hw.last_period); + perf_sample_data_init(&data, 0, last_period); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index f44c3d866f248..05acd6449cebd 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1618,7 +1618,7 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_nr = lbr_nr; if (!!x86_pmu.lbr_counters) - x86_pmu.flags |= PMU_FL_BR_CNTR; + x86_pmu.flags |= PMU_FL_BR_CNTR | PMU_FL_DYN_CONSTRAINT; if (x86_pmu.lbr_mispred) static_branch_enable(&x86_lbr_mispred); diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index a34e50fc4a8fc..5811e172f721b 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -305,17 +305,11 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) { struct intel_uncore_box *box; struct perf_event *event; - unsigned long flags; int bit; box = container_of(hrtimer, struct intel_uncore_box, hrtimer); if (!box->n_active || box->cpu != smp_processor_id()) return HRTIMER_NORESTART; - /* - * disable local interrupt to prevent uncore_pmu_event_start/stop - * to interrupt the update process - */ - local_irq_save(flags); /* * handle boxes with an active event list as opposed to active @@ -328,8 +322,6 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) uncore_perf_event_update(box, box->events[bit]); - local_irq_restore(flags); - hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration)); return HRTIMER_RESTART; } @@ -337,7 +329,7 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) { hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration), - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_REL_PINNED_HARD); } void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) @@ -347,7 +339,7 @@ void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) { - hrtimer_setup(&box->hrtimer, uncore_pmu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_setup(&box->hrtimer, uncore_pmu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2c0ce0e9545e5..d201e6ac2edee 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -120,6 +120,11 @@ static inline bool is_pebs_counter_event_group(struct perf_event *event) return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR; } +static inline bool is_acr_event_group(struct perf_event *event) +{ + return event->group_leader->hw.flags & PERF_X86_EVENT_ACR; +} + struct amd_nb { int nb_id; /* NorthBridge id */ int refcnt; /* reference count */ @@ -261,6 +266,7 @@ struct cpu_hw_events { struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; int n_excl; /* the number of exclusive events */ + int n_late_setup; /* the num of events needs late setup */ unsigned int txn_flags; int is_fake; @@ -286,6 +292,10 @@ struct cpu_hw_events { u64 fixed_ctrl_val; u64 active_fixed_ctrl_val; + /* Intel ACR configuration */ + u64 acr_cfg_b[X86_PMC_IDX_MAX]; + u64 acr_cfg_c[X86_PMC_IDX_MAX]; + /* * Intel LBR bits */ @@ -707,6 +717,15 @@ struct x86_hybrid_pmu { u64 fixed_cntr_mask64; unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; + + union { + u64 acr_cntr_mask64; + unsigned long acr_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; + union { + u64 acr_cause_mask64; + unsigned long acr_cause_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; struct event_constraint unconstrained; u64 hw_cache_event_ids @@ -789,6 +808,10 @@ struct x86_pmu { int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); void (*late_setup)(void); + void (*pebs_enable)(struct perf_event *event); + void (*pebs_disable)(struct perf_event *event); + void (*pebs_enable_all)(void); + void (*pebs_disable_all)(void); unsigned eventsel; unsigned perfctr; unsigned fixedctr; @@ -805,6 +828,14 @@ struct x86_pmu { u64 fixed_cntr_mask64; unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; + union { + u64 acr_cntr_mask64; + unsigned long acr_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; + union { + u64 acr_cause_mask64; + unsigned long acr_cause_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; int cntval_bits; u64 cntval_mask; union { @@ -871,7 +902,7 @@ struct x86_pmu { */ unsigned int bts :1, bts_active :1, - pebs :1, + ds_pebs :1, pebs_active :1, pebs_broken :1, pebs_prec_dist :1, @@ -1042,6 +1073,7 @@ do { \ #define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */ #define PMU_FL_RETIRE_LATENCY 0x200 /* Support Retire Latency in PEBS */ #define PMU_FL_BR_CNTR 0x400 /* Support branch counter logging */ +#define PMU_FL_DYN_CONSTRAINT 0x800 /* Needs dynamic constraint */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr @@ -1084,6 +1116,7 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\ .pmu_type = _pmu, \ } +int is_x86_event(struct perf_event *event); struct pmu *x86_get_pmu(unsigned int cpu); extern struct x86_pmu x86_pmu __read_mostly; @@ -1091,6 +1124,10 @@ DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period); DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update); DECLARE_STATIC_CALL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DECLARE_STATIC_CALL(x86_pmu_late_setup, *x86_pmu.late_setup); +DECLARE_STATIC_CALL(x86_pmu_pebs_enable, *x86_pmu.pebs_enable); +DECLARE_STATIC_CALL(x86_pmu_pebs_disable, *x86_pmu.pebs_disable); +DECLARE_STATIC_CALL(x86_pmu_pebs_enable_all, *x86_pmu.pebs_enable_all); +DECLARE_STATIC_CALL(x86_pmu_pebs_disable_all, *x86_pmu.pebs_disable_all); static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) { @@ -1580,6 +1617,8 @@ void intel_pmu_disable_bts(void); int intel_pmu_drain_bts_buffer(void); +void intel_pmu_late_setup(void); + u64 grt_latency_data(struct perf_event *event, u64 status); u64 cmt_latency_data(struct perf_event *event, u64 status); @@ -1636,11 +1675,13 @@ void intel_pmu_pebs_disable_all(void); void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); +void intel_pmu_pebs_late_setup(struct cpu_hw_events *cpuc); + void intel_pmu_drain_pebs_buffer(void); void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); -void intel_ds_init(void); +void intel_pebs_init(void); void intel_pmu_lbr_save_brstack(struct perf_sample_data *data, struct cpu_hw_events *cpuc, diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h index 1d9e385649b57..70078334e4a33 100644 --- a/arch/x86/events/perf_event_flags.h +++ b/arch/x86/events/perf_event_flags.h @@ -2,23 +2,24 @@ /* * struct hw_perf_event.flags flags */ -PERF_ARCH(PEBS_LDLAT, 0x00001) /* ld+ldlat data address sampling */ -PERF_ARCH(PEBS_ST, 0x00002) /* st data address sampling */ -PERF_ARCH(PEBS_ST_HSW, 0x00004) /* haswell style datala, store */ -PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */ -PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ -PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ -PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ -PERF_ARCH(PEBS_CNTR, 0x00080) /* PEBS counters snapshot */ -PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ -PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ -PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ -PERF_ARCH(PEBS_VIA_PT, 0x00800) /* use PT buffer for PEBS */ -PERF_ARCH(PAIR, 0x01000) /* Large Increment per Cycle */ -PERF_ARCH(LBR_SELECT, 0x02000) /* Save/Restore MSR_LBR_SELECT */ -PERF_ARCH(TOPDOWN, 0x04000) /* Count Topdown slots/metrics events */ -PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */ -PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */ -PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */ -PERF_ARCH(NEEDS_BRANCH_STACK, 0x40000) /* require branch stack setup */ -PERF_ARCH(BRANCH_COUNTERS, 0x80000) /* logs the counters in the extra space of each branch */ +PERF_ARCH(PEBS_LDLAT, 0x0000001) /* ld+ldlat data address sampling */ +PERF_ARCH(PEBS_ST, 0x0000002) /* st data address sampling */ +PERF_ARCH(PEBS_ST_HSW, 0x0000004) /* haswell style datala, store */ +PERF_ARCH(PEBS_LD_HSW, 0x0000008) /* haswell style datala, load */ +PERF_ARCH(PEBS_NA_HSW, 0x0000010) /* haswell style datala, unknown */ +PERF_ARCH(EXCL, 0x0000020) /* HT exclusivity on counter */ +PERF_ARCH(DYNAMIC, 0x0000040) /* dynamic alloc'd constraint */ +PERF_ARCH(PEBS_CNTR, 0x0000080) /* PEBS counters snapshot */ +PERF_ARCH(EXCL_ACCT, 0x0000100) /* accounted EXCL event */ +PERF_ARCH(AUTO_RELOAD, 0x0000200) /* use PEBS auto-reload */ +PERF_ARCH(LARGE_PEBS, 0x0000400) /* use large PEBS */ +PERF_ARCH(PEBS_VIA_PT, 0x0000800) /* use PT buffer for PEBS */ +PERF_ARCH(PAIR, 0x0001000) /* Large Increment per Cycle */ +PERF_ARCH(LBR_SELECT, 0x0002000) /* Save/Restore MSR_LBR_SELECT */ +PERF_ARCH(TOPDOWN, 0x0004000) /* Count Topdown slots/metrics events */ +PERF_ARCH(PEBS_STLAT, 0x0008000) /* st+stlat data address sampling */ +PERF_ARCH(AMD_BRS, 0x0010000) /* AMD Branch Sampling */ +PERF_ARCH(PEBS_LAT_HYBRID, 0x0020000) /* ld and st lat for hybrid */ +PERF_ARCH(NEEDS_BRANCH_STACK, 0x0040000) /* require branch stack setup */ +PERF_ARCH(BRANCH_COUNTERS, 0x0080000) /* logs the counters in the extra space of each branch */ +PERF_ARCH(ACR, 0x0100000) /* Auto counter reload */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index e6134ef2263d5..53da787b93268 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -594,7 +594,11 @@ /* V6 PMON MSR range */ #define MSR_IA32_PMC_V6_GP0_CTR 0x1900 #define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_GP0_CFG_B 0x1902 +#define MSR_IA32_PMC_V6_GP0_CFG_C 0x1903 #define MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_FX0_CFG_B 0x1982 +#define MSR_IA32_PMC_V6_FX0_CFG_C 0x1983 #define MSR_IA32_PMC_V6_STEP 4 /* KeyID partitioning between MKTME and TDX */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 812dac3f79f04..70d1d94aca7e6 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -195,6 +195,7 @@ union cpuid10_edx { */ #define ARCH_PERFMON_EXT_LEAF 0x00000023 #define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1 +#define ARCH_PERFMON_ACR_LEAF 0x2 union cpuid35_eax { struct { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7bd6bd6df4a11..3f59d7a160100 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -784,6 +784,9 @@ static inline pgprotval_t check_pgprot(pgprot_t pgprot) static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; + /* This bit combination is used to mark shadow stacks */ + WARN_ON_ONCE((pgprot_val(pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == + _PAGE_DIRTY); pfn ^= protnone_mask(pgprot_val(pgprot)); pfn &= PTE_PFN_MASK; return __pte(pfn | check_pgprot(pgprot)); @@ -1080,22 +1083,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) */ #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - * - * (Currently stuck as a macro because of indirect forward reference - * to linux/mm.h:page_to_nid()) - */ -#define mk_pte(page, pgprot) \ -({ \ - pgprot_t __pgprot = pgprot; \ - \ - WARN_ON_ONCE((pgprot_val(__pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == \ - _PAGE_DIRTY); \ - pfn_pte(page_to_pfn(page), __pgprot); \ -}) - static inline int pmd_bad(pmd_t pmd) { return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != @@ -1360,8 +1347,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, #define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0) -#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) - #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ad9212df0ec0c..906ab5e6d25fe 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -67,6 +67,10 @@ extern void x86_ce4100_early_setup(void); static inline void x86_ce4100_early_setup(void) { } #endif +#ifdef CONFIG_KEXEC_HANDOVER +#include +#endif + #ifndef _SETUP #include diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 7c488ff0c7641..c10dbb74cd001 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -38,6 +38,13 @@ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) return regs->orig_ax; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->orig_ax = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { @@ -90,6 +97,18 @@ static inline void syscall_get_arguments(struct task_struct *task, args[5] = regs->bp; } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + regs->bx = args[0]; + regs->cx = args[1]; + regs->dx = args[2]; + regs->si = args[3]; + regs->di = args[4]; + regs->bp = args[5]; +} + static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_I386; @@ -121,6 +140,30 @@ static inline void syscall_get_arguments(struct task_struct *task, } } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ +# ifdef CONFIG_IA32_EMULATION + if (task->thread_info.status & TS_COMPAT) { + regs->bx = *args++; + regs->cx = *args++; + regs->dx = *args++; + regs->si = *args++; + regs->di = *args++; + regs->bp = *args; + } else +# endif + { + regs->di = *args++; + regs->si = *args++; + regs->dx = *args++; + regs->r10 = *args++; + regs->r8 = *args++; + regs->r9 = *args; + } +} + static inline int syscall_get_arch(struct task_struct *task) { /* x32 tasks should be considered AUDIT_ARCH_X86_64. */ diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 678fb546f0a75..1ee2e5115955c 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t; #define UPROBE_SWBP_INSN 0xcc #define UPROBE_SWBP_INSN_SIZE 1 +enum { + ARCH_UPROBE_FLAG_CAN_OPTIMIZE = 0, + ARCH_UPROBE_FLAG_OPTIMIZE_FAIL = 1, +}; + struct uprobe_xol_ops; struct arch_uprobe { @@ -45,6 +50,8 @@ struct arch_uprobe { u8 ilen; } push; }; + + unsigned long flags; }; struct arch_uprobe_task { diff --git a/arch/x86/include/uapi/asm/setup_data.h b/arch/x86/include/uapi/asm/setup_data.h index 50c45ead4e7c9..2671c4e1b3a0b 100644 --- a/arch/x86/include/uapi/asm/setup_data.h +++ b/arch/x86/include/uapi/asm/setup_data.h @@ -13,7 +13,8 @@ #define SETUP_CC_BLOB 7 #define SETUP_IMA 8 #define SETUP_RNG_SEED 9 -#define SETUP_ENUM_MAX SETUP_RNG_SEED +#define SETUP_KEXEC_KHO 10 +#define SETUP_ENUM_MAX SETUP_KEXEC_KHO #define SETUP_INDIRECT (1<<31) #define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT) @@ -78,6 +79,16 @@ struct ima_setup_data { __u64 size; } __attribute__((packed)); +/* + * Locations of kexec handover metadata + */ +struct kho_data { + __u64 fdt_addr; + __u64 fdt_size; + __u64 scratch_addr; + __u64 scratch_size; +} __attribute__((packed)); + #endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_X86_SETUP_DATA_H */ diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 9d8dd8deb2a70..7d1c74681c619 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1299,6 +1299,24 @@ void __init e820__memblock_setup(void) memblock_add(entry->addr, entry->size); } + /* + * At this point with KHO we only allocate from scratch memory. + * At the same time, we configure memblock to only allow + * allocations from memory below ISA_END_ADDRESS which is not + * a natural scratch region, because Linux ignores memory below + * ISA_END_ADDRESS at runtime. Beside very few (if any) early + * allocations, we must allocate real-mode trapoline below + * ISA_END_ADDRESS. + * + * To make sure that we can actually perform allocations during + * this phase, let's mark memory below ISA_END_ADDRESS as scratch + * so we can allocate from there in a scratch-only world. + * + * After real mode trampoline is allocated, we clear scratch + * marking from the memory below ISA_END_ADDRESS + */ + memblock_mark_kho_scratch(0, ISA_END_ADDRESS); + /* Throw away partial pages: */ memblock_trim_memory(PAGE_SIZE); diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 68530fad05f74..518635cc0876c 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -233,6 +233,31 @@ setup_ima_state(const struct kimage *image, struct boot_params *params, #endif /* CONFIG_IMA_KEXEC */ } +static void setup_kho(const struct kimage *image, struct boot_params *params, + unsigned long params_load_addr, + unsigned int setup_data_offset) +{ +#ifdef CONFIG_KEXEC_HANDOVER + struct setup_data *sd = (void *)params + setup_data_offset; + struct kho_data *kho = (void *)sd + sizeof(*sd); + + sd->type = SETUP_KEXEC_KHO; + sd->len = sizeof(struct kho_data); + + /* Only add if we have all KHO images in place */ + if (!image->kho.fdt || !image->kho.scratch) + return; + + /* Add setup data */ + kho->fdt_addr = image->kho.fdt; + kho->fdt_size = PAGE_SIZE; + kho->scratch_addr = image->kho.scratch->mem; + kho->scratch_size = image->kho.scratch->bufsz; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = params_load_addr + setup_data_offset; +#endif /* CONFIG_KEXEC_HANDOVER */ +} + static int setup_boot_parameters(struct kimage *image, struct boot_params *params, unsigned long params_load_addr, @@ -312,6 +337,13 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, sizeof(struct ima_setup_data); } + if (IS_ENABLED(CONFIG_KEXEC_HANDOVER)) { + /* Setup space to store preservation metadata */ + setup_kho(image, params, params_load_addr, setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + + sizeof(struct kho_data); + } + /* Setup RNG seed */ setup_rng_seed(params, params_load_addr, setup_data_offset); @@ -479,6 +511,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel, kbuf.bufsz += sizeof(struct setup_data) + sizeof(struct ima_setup_data); + if (IS_ENABLED(CONFIG_KEXEC_HANDOVER)) + kbuf.bufsz += sizeof(struct setup_data) + + sizeof(struct kho_data); + params = kzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) return ERR_PTR(-ENOMEM); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9d2a13b37833c..496dae89cf95d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -286,8 +286,8 @@ static void __init cleanup_highmap(void) static void __init reserve_brk(void) { if (_brk_end > _brk_start) - memblock_reserve(__pa_symbol(_brk_start), - _brk_end - _brk_start); + memblock_reserve_kern(__pa_symbol(_brk_start), + _brk_end - _brk_start); /* Mark brk area as locked down and no longer taking any new allocations */ @@ -360,7 +360,7 @@ static void __init early_reserve_initrd(void) !ramdisk_image || !ramdisk_size) return; /* No initrd provided by bootloader */ - memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); + memblock_reserve_kern(ramdisk_image, ramdisk_end - ramdisk_image); } static void __init reserve_initrd(void) @@ -413,7 +413,7 @@ static void __init add_early_ima_buffer(u64 phys_addr) } if (data->size) { - memblock_reserve(data->addr, data->size); + memblock_reserve_kern(data->addr, data->size); ima_kexec_buffer_phys = data->addr; ima_kexec_buffer_size = data->size; } @@ -451,6 +451,28 @@ int __init ima_get_kexec_buffer(void **addr, size_t *size) } #endif +static void __init add_kho(u64 phys_addr, u32 data_len) +{ +#ifdef CONFIG_KEXEC_HANDOVER + struct kho_data *kho; + u64 addr = phys_addr + sizeof(struct setup_data); + u64 size = data_len - sizeof(struct setup_data); + + kho = early_memremap(addr, size); + if (!kho) { + pr_warn("setup: failed to memremap kho data (0x%llx, 0x%llx)\n", + addr, size); + return; + } + + kho_populate(kho->fdt_addr, kho->fdt_size, kho->scratch_addr, kho->scratch_size); + + early_memunmap(kho, size); +#else + pr_warn("Passed KHO data, but CONFIG_KEXEC_HANDOVER not set. Ignoring.\n"); +#endif +} + static void __init parse_setup_data(void) { struct setup_data *data; @@ -479,6 +501,9 @@ static void __init parse_setup_data(void) case SETUP_IMA: add_early_ima_buffer(pa_data); break; + case SETUP_KEXEC_KHO: + add_kho(pa_data, data_len); + break; case SETUP_RNG_SEED: data = early_memremap(pa_data, data_len); add_bootloader_randomness(data->data, data->len); @@ -553,7 +578,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) len = sizeof(*data); pa_next = data->next; - memblock_reserve(pa_data, sizeof(*data) + data->len); + memblock_reserve_kern(pa_data, sizeof(*data) + data->len); if (data->type == SETUP_INDIRECT) { len += data->len; @@ -567,7 +592,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) indirect = (struct setup_indirect *)data->data; if (indirect->type != SETUP_INDIRECT) - memblock_reserve(indirect->addr, indirect->len); + memblock_reserve_kern(indirect->addr, indirect->len); } pa_data = pa_next; @@ -770,8 +795,8 @@ static void __init early_reserve_memory(void) * __end_of_kernel_reserve symbol must be explicitly reserved with a * separate memblock_reserve() or they will be discarded. */ - memblock_reserve(__pa_symbol(_text), - (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); + memblock_reserve_kern(__pa_symbol(_text), + (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); /* * The first 4Kb of memory is a BIOS owned area, but generally it is diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 9194695662b26..d5ef04a1626d8 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -18,6 +18,7 @@ #include #include #include +#include /* Post-execution fixups. */ @@ -338,7 +339,7 @@ extern u8 uretprobe_trampoline_entry[]; extern u8 uretprobe_trampoline_end[]; extern u8 uretprobe_syscall_check[]; -void *arch_uprobe_trampoline(unsigned long *psize) +void *arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; struct pt_regs *regs = task_pt_regs(current); @@ -608,6 +609,528 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) *sr = utask->autask.saved_scratch_register; } } + +static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) +{ + return -EPERM; +} + +static struct page *tramp_mapping_pages[2] __ro_after_init; + +static struct vm_special_mapping tramp_mapping = { + .name = "[uprobes-trampoline]", + .mremap = tramp_mremap, + .pages = tramp_mapping_pages, +}; + +struct uprobe_trampoline { + struct hlist_node node; + unsigned long vaddr; + atomic64_t ref; +}; + +static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr) +{ + long delta = (long)(vaddr + 5 - vtramp); + + return delta >= INT_MIN && delta <= INT_MAX; +} + +static unsigned long find_nearest_page(unsigned long vaddr) +{ + struct vm_area_struct *vma, *prev = NULL; + unsigned long prev_vm_end = PAGE_SIZE; + VMA_ITERATOR(vmi, current->mm, 0); + + vma = vma_next(&vmi); + while (vma) { + if (prev) + prev_vm_end = prev->vm_end; + if (vma->vm_start - prev_vm_end >= PAGE_SIZE) { + if (is_reachable_by_call(prev_vm_end, vaddr)) + return prev_vm_end; + if (is_reachable_by_call(vma->vm_start - PAGE_SIZE, vaddr)) + return vma->vm_start - PAGE_SIZE; + } + prev = vma; + vma = vma_next(&vmi); + } + + return 0; +} + +static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr) +{ + struct pt_regs *regs = task_pt_regs(current); + struct mm_struct *mm = current->mm; + struct uprobe_trampoline *tramp; + struct vm_area_struct *vma; + + if (!user_64bit_mode(regs)) + return NULL; + + vaddr = find_nearest_page(vaddr); + if (!vaddr) + return NULL; + + tramp = kzalloc(sizeof(*tramp), GFP_KERNEL); + if (unlikely(!tramp)) + return NULL; + + atomic64_set(&tramp->ref, 1); + tramp->vaddr = vaddr; + + vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE, + VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO, + &tramp_mapping); + if (IS_ERR(vma)) + goto free_area; + return tramp; + +free_area: + kfree(tramp); + return NULL; +} + +static struct uprobe_trampoline *uprobe_trampoline_get(unsigned long vaddr) +{ + struct uprobes_state *state = ¤t->mm->uprobes_state; + struct uprobe_trampoline *tramp = NULL; + + hlist_for_each_entry(tramp, &state->head_tramps, node) { + if (is_reachable_by_call(tramp->vaddr, vaddr)) { + atomic64_inc(&tramp->ref); + return tramp; + } + } + + tramp = create_uprobe_trampoline(vaddr); + if (!tramp) + return NULL; + + hlist_add_head(&tramp->node, &state->head_tramps); + return tramp; +} + +static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp) +{ + hlist_del(&tramp->node); + kfree(tramp); +} + +static void uprobe_trampoline_put(struct uprobe_trampoline *tramp) +{ + if (tramp && atomic64_dec_and_test(&tramp->ref)) + destroy_uprobe_trampoline(tramp); +} + +void arch_uprobe_init_state(struct mm_struct *mm) +{ + INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps); +} + +void arch_uprobe_clear_state(struct mm_struct *mm) +{ + struct uprobes_state *state = &mm->uprobes_state; + struct uprobe_trampoline *tramp; + struct hlist_node *n; + + hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node) + destroy_uprobe_trampoline(tramp); +} + +static bool __in_uprobe_trampoline(unsigned long ip) +{ + struct vm_area_struct *vma = vma_lookup(current->mm, ip); + + return vma && vma_is_special_mapping(vma, &tramp_mapping); +} + +static bool in_uprobe_trampoline(unsigned long ip) +{ + struct mm_struct *mm = current->mm; + bool found, retry = true; + unsigned int seq; + + rcu_read_lock(); + if (mmap_lock_speculate_try_begin(mm, &seq)) { + found = __in_uprobe_trampoline(ip); + retry = mmap_lock_speculate_retry(mm, seq); + } + rcu_read_unlock(); + + if (retry) { + mmap_read_lock(mm); + found = __in_uprobe_trampoline(ip); + mmap_read_unlock(mm); + } + return found; +} + +SYSCALL_DEFINE0(uprobe) +{ + struct pt_regs *regs = task_pt_regs(current); + unsigned long ip, sp, ax_r11_cx_ip[4]; + int err; + + /* Allow execution only from uprobe trampolines. */ + if (!in_uprobe_trampoline(regs->ip)) + goto sigill; + + err = copy_from_user(ax_r11_cx_ip, (void __user *)regs->sp, sizeof(ax_r11_cx_ip)); + if (err) + goto sigill; + + ip = regs->ip; + + /* + * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus: + * - adjust ip to the probe address, call saved next instruction address + * - adjust sp to the probe's stack frame (check trampoline code) + */ + regs->ax = ax_r11_cx_ip[0]; + regs->r11 = ax_r11_cx_ip[1]; + regs->cx = ax_r11_cx_ip[2]; + regs->ip = ax_r11_cx_ip[3] - 5; + regs->sp += sizeof(ax_r11_cx_ip); + regs->orig_ax = -1; + + sp = regs->sp; + + handle_syscall_uprobe(regs, regs->ip); + + /* + * Some of the uprobe consumers has changed sp, we can do nothing, + * just return via iret. + */ + if (regs->sp != sp) + return regs->ax; + + regs->sp -= sizeof(ax_r11_cx_ip); + + /* for the case uprobe_consumer has changed ax/r11/cx */ + ax_r11_cx_ip[0] = regs->ax; + ax_r11_cx_ip[1] = regs->r11; + ax_r11_cx_ip[2] = regs->cx; + + /* keep return address unless we are instructed otherwise */ + if (ax_r11_cx_ip[3] - 5 != regs->ip) + ax_r11_cx_ip[3] = regs->ip; + + regs->ip = ip; + + err = copy_to_user((void __user *)regs->sp, ax_r11_cx_ip, sizeof(ax_r11_cx_ip)); + if (err) + goto sigill; + + /* ensure sysret, see do_syscall_64() */ + regs->r11 = regs->flags; + regs->cx = regs->ip; + return 0; + +sigill: + force_sig(SIGILL); + return -1; +} + +asm ( + ".pushsection .rodata\n" + ".balign " __stringify(PAGE_SIZE) "\n" + "uprobe_trampoline_entry:\n" + "push %rcx\n" + "push %r11\n" + "push %rax\n" + "movq $" __stringify(__NR_uprobe) ", %rax\n" + "syscall\n" + "pop %rax\n" + "pop %r11\n" + "pop %rcx\n" + "ret\n" + ".balign " __stringify(PAGE_SIZE) "\n" + ".popsection\n" +); + +extern u8 uprobe_trampoline_entry[]; + +static int __init arch_uprobes_init(void) +{ + tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry); + return 0; +} + +late_initcall(arch_uprobes_init); + +enum { + OPT_PART, + OPT_INSN, + UNOPT_INT3, + UNOPT_PART, +}; + +struct write_opcode_ctx { + unsigned long base; + int update; +}; + +static int is_call_insn(uprobe_opcode_t *insn) +{ + return *insn == CALL_INSN_OPCODE; +} + +static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode, + int nbytes, void *data) +{ + struct write_opcode_ctx *ctx = data; + uprobe_opcode_t old_opcode[5]; + + uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5); + + switch (ctx->update) { + case OPT_PART: + case OPT_INSN: + if (is_swbp_insn(&old_opcode[0])) + return 1; + break; + case UNOPT_INT3: + if (is_call_insn(&old_opcode[0])) + return 1; + break; + case UNOPT_PART: + if (is_swbp_insn(&old_opcode[0])) + return 1; + break; + } + + return -1; +} + +static int write_insn(struct vm_area_struct *vma, unsigned long vaddr, + uprobe_opcode_t *insn, int nbytes, void *ctx) +{ + return uprobe_write(vma, vaddr, insn, nbytes, verify_insn, true, ctx); +} + +static void relative_call(void *dest, long from, long to) +{ + struct __packed __arch_relative_insn { + u8 op; + s32 raddr; + } *insn; + + insn = (struct __arch_relative_insn *)dest; + insn->raddr = (s32)(to - (from + 5)); + insn->op = CALL_INSN_OPCODE; +} + +static int swbp_optimize(struct vm_area_struct *vma, unsigned long vaddr, unsigned long tramp) +{ + struct write_opcode_ctx ctx = { + .base = vaddr, + }; + char call[5]; + int err; + + relative_call(call, vaddr, tramp); + + /* + * We are in state where breakpoint (int3) is installed on top of first + * byte of the nop5 instruction. We will do following steps to overwrite + * this to call instruction: + * + * - sync cores + * - write last 4 bytes of the call instruction + * - sync cores + * - update the call instruction opcode + */ + + text_poke_sync(); + + ctx.update = OPT_PART; + err = write_insn(vma, vaddr + 1, call + 1, 4, &ctx); + if (err) + return err; + + text_poke_sync(); + + ctx.update = OPT_INSN; + return write_insn(vma, vaddr, call, 1, &ctx); +} + +static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + uprobe_opcode_t int3 = UPROBE_SWBP_INSN; + struct write_opcode_ctx ctx = { + .base = vaddr, + }; + int err; + + /* + * We need to overwrite call instruction into nop5 instruction with + * breakpoint (int3) installed on top of its first byte. We will: + * + * - overwrite call opcode with breakpoint (int3) + * - sync cores + * - write last 4 bytes of the nop5 instruction + * - sync cores + */ + + ctx.update = UNOPT_INT3; + err = write_insn(vma, vaddr, &int3, 1, &ctx); + if (err) + return err; + + text_poke_sync(); + + ctx.update = UNOPT_PART; + err = write_insn(vma, vaddr + 1, (uprobe_opcode_t *) auprobe->insn + 1, 4, &ctx); + + text_poke_sync(); + return err; +} + +static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len) +{ + unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD; + struct vm_area_struct *vma; + struct page *page; + + page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); + if (IS_ERR(page)) + return PTR_ERR(page); + uprobe_copy_from_page(page, vaddr, dst, len); + put_page(page); + return 0; +} + +static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr) +{ + struct __packed __arch_relative_insn { + u8 op; + s32 raddr; + } *call = (struct __arch_relative_insn *) insn; + + if (!is_call_insn(insn)) + return false; + return __in_uprobe_trampoline(vaddr + 5 + call->raddr); +} + +static int is_optimized(struct mm_struct *mm, unsigned long vaddr, bool *optimized) +{ + uprobe_opcode_t insn[5]; + int err; + + err = copy_from_vaddr(mm, vaddr, &insn, 5); + if (err) + return err; + *optimized = __is_optimized((uprobe_opcode_t *)&insn, vaddr); + return 0; +} + +static bool should_optimize(struct arch_uprobe *auprobe) +{ + return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) && + test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); +} + +int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (should_optimize(auprobe)) { + bool optimized = false; + int err; + + /* + * We could race with another thread that already optimized the probe, + * so let's not overwrite it with int3 again in this case. + */ + err = is_optimized(vma->vm_mm, vaddr, &optimized); + if (err || optimized) + return err; + } + return uprobe_write_opcode(vma, vaddr, UPROBE_SWBP_INSN, true); +} + +int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) { + struct mm_struct *mm = vma->vm_mm; + bool optimized = false; + int err; + + err = is_optimized(mm, vaddr, &optimized); + if (err) + return err; + if (optimized) + WARN_ON_ONCE(swbp_unoptimize(auprobe, vma, vaddr)); + } + return uprobe_write_opcode(vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, false); +} + +static int __arch_uprobe_optimize(struct mm_struct *mm, unsigned long vaddr) +{ + struct uprobe_trampoline *tramp; + struct vm_area_struct *vma; + int err = 0; + + vma = find_vma(mm, vaddr); + if (!vma) + return -1; + tramp = uprobe_trampoline_get(vaddr); + if (!tramp) + return -1; + err = swbp_optimize(vma, vaddr, tramp->vaddr); + if (WARN_ON_ONCE(err)) + uprobe_trampoline_put(tramp); + return err; +} + +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + struct mm_struct *mm = current->mm; + uprobe_opcode_t insn[5]; + + /* + * Do not optimize if shadow stack is enabled, the return address hijack + * code in arch_uretprobe_hijack_return_addr updates wrong frame when + * the entry uprobe is optimized and the shadow stack crashes the app. + */ + if (shstk_is_enabled()) + return; + + if (!should_optimize(auprobe)) + return; + + mmap_write_lock(mm); + + /* + * Check if some other thread already optimized the uprobe for us, + * if it's the case just go away silently. + */ + if (copy_from_vaddr(mm, vaddr, &insn, 5)) + goto unlock; + if (!is_swbp_insn((uprobe_opcode_t*) &insn)) + goto unlock; + + /* + * If we fail to optimize the uprobe we set the fail bit so the + * above should_optimize will fail from now on. + */ + if (__arch_uprobe_optimize(mm, vaddr)) + set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags); + +unlock: + mmap_write_unlock(mm); +} + +static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + if (memcmp(&auprobe->insn, x86_nops[5], 5)) + return false; + /* We can't do cross page atomic writes yet. */ + return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5; +} #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit @@ -621,6 +1144,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } +static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + return false; +} #endif /* CONFIG_X86_64 */ struct uprobe_xol_ops { @@ -840,6 +1367,11 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) insn_byte_t p; int i; + /* x86_nops[insn->length]; same as jmp with .offs = 0 */ + if (insn->length <= ASM_NOP_MAX && + !memcmp(insn->kaddr, x86_nops[insn->length], insn->length)) + goto setup; + switch (opc1) { case 0xeb: /* jmp 8 */ case 0xe9: /* jmp 32 */ @@ -982,6 +1514,9 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, if (ret) return ret; + if (can_optimize(auprobe, addr)) + set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); + ret = branch_setup_xol_ops(auprobe, &insn); if (ret != -ENOSYS) return ret; diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 89079ea73e65b..a4700ef6eb64e 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -266,6 +266,32 @@ static void effective_prot(struct ptdump_state *pt_st, int level, u64 val) st->prot_levels[level] = effective; } +static void effective_prot_pte(struct ptdump_state *st, pte_t pte) +{ + effective_prot(st, 4, pte_val(pte)); +} + +static void effective_prot_pmd(struct ptdump_state *st, pmd_t pmd) +{ + effective_prot(st, 3, pmd_val(pmd)); +} + +static void effective_prot_pud(struct ptdump_state *st, pud_t pud) +{ + effective_prot(st, 2, pud_val(pud)); +} + +static void effective_prot_p4d(struct ptdump_state *st, p4d_t p4d) +{ + effective_prot(st, 1, p4d_val(p4d)); +} + +static void effective_prot_pgd(struct ptdump_state *st, pgd_t pgd) +{ + effective_prot(st, 0, pgd_val(pgd)); +} + + /* * This function gets called on a break in a continuous series * of PTE entries; the next one is different so we need to @@ -362,6 +388,38 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } } +static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte) +{ + note_page(pt_st, addr, 4, pte_val(pte)); +} + +static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd) +{ + note_page(pt_st, addr, 3, pmd_val(pmd)); +} + +static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud) +{ + note_page(pt_st, addr, 2, pud_val(pud)); +} + +static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d) +{ + note_page(pt_st, addr, 1, p4d_val(p4d)); +} + +static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd) +{ + note_page(pt_st, addr, 0, pgd_val(pgd)); +} + +static void note_page_flush(struct ptdump_state *pt_st) +{ + pte_t pte_zero = {0}; + + note_page(pt_st, 0, -1, pte_val(pte_zero)); +} + bool ptdump_walk_pgd_level_core(struct seq_file *m, struct mm_struct *mm, pgd_t *pgd, bool checkwx, bool dmesg) @@ -378,8 +436,17 @@ bool ptdump_walk_pgd_level_core(struct seq_file *m, struct pg_state st = { .ptdump = { - .note_page = note_page, - .effective_prot = effective_prot, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, + .effective_prot_pte = effective_prot_pte, + .effective_prot_pmd = effective_prot_pmd, + .effective_prot_pud = effective_prot_pud, + .effective_prot_p4d = effective_prot_p4d, + .effective_prot_pgd = effective_prot_pgd, .range = ptdump_ranges }, .level = -1, diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7c4f6f591f2b2..8cea7dcf8fa01 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1464,16 +1464,21 @@ static unsigned long probe_memory_block_size(void) } /* - * Use max block size to minimize overhead on bare metal, where - * alignment for memory hotplug isn't a concern. + * When hotplug alignment is not a concern, maximize blocksize + * to minimize overhead. Otherwise, align to the lesser of advice + * alignment and end of memory alignment. */ - if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + bz = memory_block_advised_max_size(); + if (!bz) { bz = MAX_BLOCK_SIZE; - goto done; + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + goto done; + } else { + bz = max(min(bz, MAX_BLOCK_SIZE), MIN_MEMORY_BLOCK_SIZE); } /* Find the largest allowed block size that aligns to memory end */ - for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) { + for (; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) { if (IS_ALIGNED(boot_mem_end, bz)) break; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index a05fcddfc811a..1dee9bdbeea58 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -205,7 +205,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) if (!ptdesc) failed = true; - if (ptdesc && !pagetable_pmd_ctor(ptdesc)) { + if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) { pagetable_free(ptdesc); ptdesc = NULL; failed = true; @@ -809,14 +809,13 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) for (i = 0; i < PTRS_PER_PMD; i++) { if (!pmd_none(pmd_sv[i])) { pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]); - free_page((unsigned long)pte); + pte_free_kernel(&init_mm, pte); } } free_page((unsigned long)pmd_sv); - pagetable_dtor(virt_to_ptdesc(pmd)); - free_page((unsigned long)pmd); + pmd_free(&init_mm, pmd); return 1; } @@ -839,7 +838,7 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); - free_page((unsigned long)pte); + pte_free_kernel(&init_mm, pte); return 1; } diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index f9bc444a3064d..9b9f4534086d2 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -65,6 +65,8 @@ void __init reserve_real_mode(void) * setup_arch(). */ memblock_reserve(0, SZ_1M); + + memblock_clear_kho_scratch(0, SZ_1M); } static void __init sme_sev_setup_real_mode(struct trampoline_header *th) diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 1647a7cc3fbf6..cb1725c40e369 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -269,17 +269,11 @@ static inline pte_t pte_mkwrite_novma(pte_t pte) ((__pgprot((pgprot_val(prot) & ~_PAGE_CA_MASK) | \ _PAGE_CA_BYPASS))) -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ - #define PFN_PTE_SHIFT PAGE_SHIFT #define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT) #define pte_same(a,b) (pte_val(a) == pte_val(b)) #define pte_page(x) pfn_to_page(pte_pfn(x)) #define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) -#define mk_pte(page, prot) pfn_pte(page_to_pfn(page), prot) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { diff --git a/arch/xtensa/include/asm/syscall.h b/arch/xtensa/include/asm/syscall.h index 5ee974bf83305..7db3b489c8ad0 100644 --- a/arch/xtensa/include/asm/syscall.h +++ b/arch/xtensa/include/asm/syscall.h @@ -28,6 +28,13 @@ static inline long syscall_get_nr(struct task_struct *task, return regs->syscall; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->syscall = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { @@ -68,6 +75,17 @@ static inline void syscall_get_arguments(struct task_struct *task, args[i] = regs->areg[reg[i]]; } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + static const unsigned int reg[] = XTENSA_SYSCALL_ARGUMENT_REGS; + unsigned int i; + + for (i = 0; i < 6; ++i) + regs->areg[reg[i]] = args[i]; +} + asmlinkage long xtensa_rt_sigreturn(void); asmlinkage long xtensa_shmat(int, char __user *, int); asmlinkage long xtensa_fadvise64_64(int, int, diff --git a/ci/diffs/.keep b/ci/diffs/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/ci/diffs/0001-Revert-bpf-Avoid-unnecessary-audit-log-for-CPU-secur.patch b/ci/diffs/0001-Revert-bpf-Avoid-unnecessary-audit-log-for-CPU-secur.patch new file mode 100644 index 0000000000000..3b6139225e7bf --- /dev/null +++ b/ci/diffs/0001-Revert-bpf-Avoid-unnecessary-audit-log-for-CPU-secur.patch @@ -0,0 +1,33 @@ +From 5440a12ac8fb2a8e051c597fcf5d85b427fe612a Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Fri, 13 Oct 2023 12:44:34 -0700 +Subject: [PATCH] Revert "bpf: Avoid unnecessary audit log for CPU security + mitigations" + +This reverts commit 236334aeec0f93217cf9235f2004e61a0a1a5985. +--- + include/linux/bpf.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/include/linux/bpf.h b/include/linux/bpf.h +index f0891ba24cb1..61bde4520f5c 100644 +--- a/include/linux/bpf.h ++++ b/include/linux/bpf.h +@@ -2164,12 +2164,12 @@ static inline bool bpf_allow_uninit_stack(void) + + static inline bool bpf_bypass_spec_v1(void) + { +- return cpu_mitigations_off() || perfmon_capable(); ++ return perfmon_capable() || cpu_mitigations_off(); + } + + static inline bool bpf_bypass_spec_v4(void) + { +- return cpu_mitigations_off() || perfmon_capable(); ++ return perfmon_capable() || cpu_mitigations_off(); + } + + int bpf_map_new_fd(struct bpf_map *map, int flags); +-- +2.34.1 + diff --git a/ci/diffs/0001-arch-Kconfig-Move-SPECULATION_MITIGATIONS-to-arch-Kc.patch b/ci/diffs/0001-arch-Kconfig-Move-SPECULATION_MITIGATIONS-to-arch-Kc.patch new file mode 100644 index 0000000000000..63bdd28adedd2 --- /dev/null +++ b/ci/diffs/0001-arch-Kconfig-Move-SPECULATION_MITIGATIONS-to-arch-Kc.patch @@ -0,0 +1,69 @@ +From c71766e8ff7a7f950522d25896fba758585500df Mon Sep 17 00:00:00 2001 +From: Song Liu +Date: Mon, 22 Apr 2024 21:14:40 -0700 +Subject: [PATCH] arch/Kconfig: Move SPECULATION_MITIGATIONS to arch/Kconfig + +SPECULATION_MITIGATIONS is currently defined only for x86. As a result, +IS_ENABLED(CONFIG_SPECULATION_MITIGATIONS) is always false for other +archs. f337a6a21e2f effectively set "mitigations=off" by default on +non-x86 archs, which is not desired behavior. Jakub observed this +change when running bpf selftests on s390 and arm64. + +Fix this by moving SPECULATION_MITIGATIONS to arch/Kconfig so that it is +available in all archs and thus can be used safely in kernel/cpu.c + +Fixes: f337a6a21e2f ("x86/cpu: Actually turn off mitigations by default for SPECULATION_MITIGATIONS=n") +Cc: stable@vger.kernel.org +Cc: Sean Christopherson +Cc: Ingo Molnar +Cc: Daniel Sneddon +Cc: Jakub Kicinski +Signed-off-by: Song Liu +--- + arch/Kconfig | 10 ++++++++++ + arch/x86/Kconfig | 10 ---------- + 2 files changed, 10 insertions(+), 10 deletions(-) + +diff --git a/arch/Kconfig b/arch/Kconfig +index 9f066785bb71..8f4af75005f8 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -1609,4 +1609,14 @@ config CC_HAS_SANE_FUNCTION_ALIGNMENT + # strict alignment always, even with -falign-functions. + def_bool CC_HAS_MIN_FUNCTION_ALIGNMENT || CC_IS_CLANG + ++menuconfig SPECULATION_MITIGATIONS ++ bool "Mitigations for speculative execution vulnerabilities" ++ default y ++ help ++ Say Y here to enable options which enable mitigations for ++ speculative execution hardware vulnerabilities. ++ ++ If you say N, all mitigations will be disabled. You really ++ should know what you are doing to say so. ++ + endmenu +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 39886bab943a..50c890fce5e0 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -2486,16 +2486,6 @@ config PREFIX_SYMBOLS + def_bool y + depends on CALL_PADDING && !CFI_CLANG + +-menuconfig SPECULATION_MITIGATIONS +- bool "Mitigations for speculative execution vulnerabilities" +- default y +- help +- Say Y here to enable options which enable mitigations for +- speculative execution hardware vulnerabilities. +- +- If you say N, all mitigations will be disabled. You really +- should know what you are doing to say so. +- + if SPECULATION_MITIGATIONS + + config MITIGATION_PAGE_TABLE_ISOLATION +-- +2.43.0 + diff --git a/ci/diffs/0001-bpf-Fix-a-few-selftest-failures-due-to-llvm18-change.patch b/ci/diffs/0001-bpf-Fix-a-few-selftest-failures-due-to-llvm18-change.patch new file mode 100644 index 0000000000000..a13d767197413 --- /dev/null +++ b/ci/diffs/0001-bpf-Fix-a-few-selftest-failures-due-to-llvm18-change.patch @@ -0,0 +1,94 @@ +From fb9a697860acd8f54f2ba6647923794378eb33da Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Sun, 26 Nov 2023 21:03:42 -0800 +Subject: [PATCH] bpf: Fix a few selftest failures due to llvm18 change + +With latest upstream llvm18, the following test cases failed: + + $ ./test_progs -j + #13/2 bpf_cookie/multi_kprobe_link_api:FAIL + #13/3 bpf_cookie/multi_kprobe_attach_api:FAIL + #13 bpf_cookie:FAIL + #77 fentry_fexit:FAIL + #78/1 fentry_test/fentry:FAIL + #78 fentry_test:FAIL + #82/1 fexit_test/fexit:FAIL + #82 fexit_test:FAIL + #112/1 kprobe_multi_test/skel_api:FAIL + #112/2 kprobe_multi_test/link_api_addrs:FAIL + [...] + #112 kprobe_multi_test:FAIL + #356/17 test_global_funcs/global_func17:FAIL + #356 test_global_funcs:FAIL + +Further analysis shows llvm upstream patch [1] is responsible for the above +failures. For example, for function bpf_fentry_test7() in net/bpf/test_run.c, +without [1], the asm code is: + + 0000000000000400 : + 400: f3 0f 1e fa endbr64 + 404: e8 00 00 00 00 callq 0x409 + 409: 48 89 f8 movq %rdi, %rax + 40c: c3 retq + 40d: 0f 1f 00 nopl (%rax) + +... and with [1], the asm code is: + + 0000000000005d20 : + 5d20: e8 00 00 00 00 callq 0x5d25 + 5d25: c3 retq + +... and is called instead of +and this caused test failures for #13/#77 etc. except #356. + +For test case #356/17, with [1] (progs/test_global_func17.c)), the main prog +looks like: + + 0000000000000000 : + 0: b4 00 00 00 2a 00 00 00 w0 = 0x2a + 1: 95 00 00 00 00 00 00 00 exit + +... which passed verification while the test itself expects a verification +failure. + +Let us add 'barrier_var' style asm code in both places to prevent function +specialization which caused selftests failure. + + [1] https://github.com/llvm/llvm-project/pull/72903 + +Signed-off-by: Yonghong Song +Signed-off-by: Daniel Borkmann +Link: https://lore.kernel.org/bpf/20231127050342.1945270-1-yonghong.song@linux.dev +--- + net/bpf/test_run.c | 2 +- + tools/testing/selftests/bpf/progs/test_global_func17.c | 1 + + 2 files changed, 2 insertions(+), 1 deletion(-) + +diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c +index c9fdcc5cdce1..711cf5d59816 100644 +--- a/net/bpf/test_run.c ++++ b/net/bpf/test_run.c +@@ -542,7 +542,7 @@ struct bpf_fentry_test_t { + + int noinline bpf_fentry_test7(struct bpf_fentry_test_t *arg) + { +- asm volatile (""); ++ asm volatile ("": "+r"(arg)); + return (long)arg; + } + +diff --git a/tools/testing/selftests/bpf/progs/test_global_func17.c b/tools/testing/selftests/bpf/progs/test_global_func17.c +index a32e11c7d933..5de44b09e8ec 100644 +--- a/tools/testing/selftests/bpf/progs/test_global_func17.c ++++ b/tools/testing/selftests/bpf/progs/test_global_func17.c +@@ -5,6 +5,7 @@ + + __noinline int foo(int *p) + { ++ barrier_var(p); + return p ? (*p = 42) : 0; + } + +-- +2.34.1 + diff --git a/ci/diffs/0001-bpf-Fix-a-verifier-bug-due-to-incorrect-branch-offse.patch b/ci/diffs/0001-bpf-Fix-a-verifier-bug-due-to-incorrect-branch-offse.patch new file mode 100644 index 0000000000000..5832a42664706 --- /dev/null +++ b/ci/diffs/0001-bpf-Fix-a-verifier-bug-due-to-incorrect-branch-offse.patch @@ -0,0 +1,67 @@ +From dfce9cb3140592b886838e06f3e0c25fea2a9cae Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Thu, 30 Nov 2023 18:46:40 -0800 +Subject: [PATCH 1/1] bpf: Fix a verifier bug due to incorrect branch offset + comparison with cpu=v4 + +Bpf cpu=v4 support is introduced in [1] and Commit 4cd58e9af8b9 +("bpf: Support new 32bit offset jmp instruction") added support for new +32bit offset jmp instruction. Unfortunately, in function +bpf_adj_delta_to_off(), for new branch insn with 32bit offset, the offset +(plus/minor a small delta) compares to 16-bit offset bound +[S16_MIN, S16_MAX], which caused the following verification failure: + $ ./test_progs-cpuv4 -t verif_scale_pyperf180 + ... + insn 10 cannot be patched due to 16-bit range + ... + libbpf: failed to load object 'pyperf180.bpf.o' + scale_test:FAIL:expect_success unexpected error: -12 (errno 12) + #405 verif_scale_pyperf180:FAIL + +Note that due to recent llvm18 development, the patch [2] (already applied +in bpf-next) needs to be applied to bpf tree for testing purpose. + +The fix is rather simple. For 32bit offset branch insn, the adjusted +offset compares to [S32_MIN, S32_MAX] and then verification succeeded. + + [1] https://lore.kernel.org/all/20230728011143.3710005-1-yonghong.song@linux.dev + [2] https://lore.kernel.org/bpf/20231110193644.3130906-1-yonghong.song@linux.dev + +Fixes: 4cd58e9af8b9 ("bpf: Support new 32bit offset jmp instruction") +Signed-off-by: Yonghong Song +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/bpf/20231201024640.3417057-1-yonghong.song@linux.dev +--- + kernel/bpf/core.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c +index cd3afe57ece3..fe254ae035fe 100644 +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -371,14 +371,18 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, + static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, + s32 end_new, s32 curr, const bool probe_pass) + { +- const s32 off_min = S16_MIN, off_max = S16_MAX; ++ s64 off_min, off_max, off; + s32 delta = end_new - end_old; +- s32 off; + +- if (insn->code == (BPF_JMP32 | BPF_JA)) ++ if (insn->code == (BPF_JMP32 | BPF_JA)) { + off = insn->imm; +- else ++ off_min = S32_MIN; ++ off_max = S32_MAX; ++ } else { + off = insn->off; ++ off_min = S16_MIN; ++ off_max = S16_MAX; ++ } + + if (curr < pos && curr + off + 1 >= end_old) + off += delta; +-- +2.34.1 + diff --git a/ci/diffs/0001-bpf-next-selftests-bpf-Fix-a-btf_dump-selftest-failure.patch b/ci/diffs/0001-bpf-next-selftests-bpf-Fix-a-btf_dump-selftest-failure.patch new file mode 100644 index 0000000000000..ea6b2386d0345 --- /dev/null +++ b/ci/diffs/0001-bpf-next-selftests-bpf-Fix-a-btf_dump-selftest-failure.patch @@ -0,0 +1,40 @@ +From patchwork Fri Aug 2 18:54:34 2024 +From: Yonghong Song +Subject: [PATCH bpf-next] selftests/bpf: Fix a btf_dump selftest failure + +Jakub reported bpf selftest "btf_dump" failure after forwarding to +v6.11-rc1 with netdev. + Error: #33 btf_dump + Error: #33/15 btf_dump/btf_dump: var_data + btf_dump_data:FAIL:find type id unexpected find type id: actual -2 < expected 0 + +The reason for the failure is due to + commit 94ede2a3e913 ("profiling: remove stale percpu flip buffer variables") +where percpu static variable "cpu_profile_flip" is removed. + +Let us replace "cpu_profile_flip" with a variable in bpf subsystem +so whenever that variable gets deleted or renamed, we can detect the +failure immediately. In this case, I picked a static percpu variable +"bpf_cgrp_storage_busy" which is defined in kernel/bpf/bpf_cgrp_storage.c. + +Reported-by: Jakub Kicinski +Signed-off-by: Yonghong Song +--- + tools/testing/selftests/bpf/prog_tests/btf_dump.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c +index 09a8e6f9b379..b293b8501fd6 100644 +--- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c ++++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c +@@ -805,8 +805,8 @@ static void test_btf_dump_var_data(struct btf *btf, struct btf_dump *d, + TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_number", int, BTF_F_COMPACT, + "int cpu_number = (int)100", 100); + #endif +- TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_profile_flip", int, BTF_F_COMPACT, +- "static int cpu_profile_flip = (int)2", 2); ++ TEST_BTF_DUMP_VAR(btf, d, NULL, str, "bpf_cgrp_storage_busy", int, BTF_F_COMPACT, ++ "static int bpf_cgrp_storage_busy = (int)2", 2); + } + + static void test_btf_datasec(struct btf *btf, struct btf_dump *d, char *str, diff --git a/ci/diffs/0001-net-bpf-Use-sockopt_lock_sock-in-ip_sock_set_tos.patch b/ci/diffs/0001-net-bpf-Use-sockopt_lock_sock-in-ip_sock_set_tos.patch new file mode 100644 index 0000000000000..bd12bd9b3fba5 --- /dev/null +++ b/ci/diffs/0001-net-bpf-Use-sockopt_lock_sock-in-ip_sock_set_tos.patch @@ -0,0 +1,99 @@ +From c8268f8e9fa33c32e1f2f86fc7b703408a396c70 Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Fri, 27 Oct 2023 11:24:24 -0700 +Subject: [PATCH] net: bpf: Use sockopt_lock_sock() in ip_sock_set_tos() + +With latest sync from net-next tree, bpf-next has a bpf selftest failure: + [root@arch-fb-vm1 bpf]# ./test_progs -t setget_sockopt + ... + [ 76.194349] ============================================ + [ 76.194682] WARNING: possible recursive locking detected + [ 76.195039] 6.6.0-rc7-g37884503df08-dirty #67 Tainted: G W OE + [ 76.195518] -------------------------------------------- + [ 76.195852] new_name/154 is trying to acquire lock: + [ 76.196159] ffff8c3e06ad8d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: ip_sock_set_tos+0x19/0x30 + [ 76.196669] + [ 76.196669] but task is already holding lock: + [ 76.197028] ffff8c3e06ad8d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: inet_listen+0x21/0x70 + [ 76.197517] + [ 76.197517] other info that might help us debug this: + [ 76.197919] Possible unsafe locking scenario: + [ 76.197919] + [ 76.198287] CPU0 + [ 76.198444] ---- + [ 76.198600] lock(sk_lock-AF_INET); + [ 76.198831] lock(sk_lock-AF_INET); + [ 76.199062] + [ 76.199062] *** DEADLOCK *** + [ 76.199062] + [ 76.199420] May be due to missing lock nesting notation + [ 76.199420] + [ 76.199879] 2 locks held by new_name/154: + [ 76.200131] #0: ffff8c3e06ad8d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: inet_listen+0x21/0x70 + [ 76.200644] #1: ffffffff90f96a40 (rcu_read_lock){....}-{1:2}, at: __cgroup_bpf_run_filter_sock_ops+0x55/0x290 + [ 76.201268] + [ 76.201268] stack backtrace: + [ 76.201538] CPU: 4 PID: 154 Comm: new_name Tainted: G W OE 6.6.0-rc7-g37884503df08-dirty #67 + [ 76.202134] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 + [ 76.202699] Call Trace: + [ 76.202858] + [ 76.203002] dump_stack_lvl+0x4b/0x80 + [ 76.203239] __lock_acquire+0x740/0x1ec0 + [ 76.203503] lock_acquire+0xc1/0x2a0 + [ 76.203766] ? ip_sock_set_tos+0x19/0x30 + [ 76.204050] ? sk_stream_write_space+0x12a/0x230 + [ 76.204389] ? lock_release+0xbe/0x260 + [ 76.204661] lock_sock_nested+0x32/0x80 + [ 76.204942] ? ip_sock_set_tos+0x19/0x30 + [ 76.205208] ip_sock_set_tos+0x19/0x30 + [ 76.205452] do_ip_setsockopt+0x4b3/0x1580 + [ 76.205719] __bpf_setsockopt+0x62/0xa0 + [ 76.205963] bpf_sock_ops_setsockopt+0x11/0x20 + [ 76.206247] bpf_prog_630217292049c96e_bpf_test_sockopt_int+0xbc/0x123 + [ 76.206660] bpf_prog_493685a3bae00bbd_bpf_test_ip_sockopt+0x49/0x4b + [ 76.207055] bpf_prog_b0bcd27f269aeea0_skops_sockopt+0x44c/0xec7 + [ 76.207437] __cgroup_bpf_run_filter_sock_ops+0xda/0x290 + [ 76.207829] __inet_listen_sk+0x108/0x1b0 + [ 76.208122] inet_listen+0x48/0x70 + [ 76.208373] __sys_listen+0x74/0xb0 + [ 76.208630] __x64_sys_listen+0x16/0x20 + [ 76.208911] do_syscall_64+0x3f/0x90 + [ 76.209174] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 + ... + +Both ip_sock_set_tos() and inet_listen() calls lock_sock(sk) which +caused a dead lock. + +To fix the issue, use sockopt_lock_sock() in ip_sock_set_tos() +instead. sockopt_lock_sock() will avoid lock_sock() if it is in bpf +context. + +Fixes: 878d951c6712 ("inet: lock the socket in ip_sock_set_tos()") +Suggested-by: Martin KaFai Lau +Signed-off-by: Yonghong Song +Signed-off-by: Andrii Nakryiko +Reviewed-by: Eric Dumazet +Link: https://lore.kernel.org/bpf/20231027182424.1444845-1-yonghong.song@linux.dev +--- + net/ipv4/ip_sockglue.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c +index 9c68b6b74d9f..2efc53526a38 100644 +--- a/net/ipv4/ip_sockglue.c ++++ b/net/ipv4/ip_sockglue.c +@@ -602,9 +602,9 @@ void __ip_sock_set_tos(struct sock *sk, int val) + + void ip_sock_set_tos(struct sock *sk, int val) + { +- lock_sock(sk); ++ sockopt_lock_sock(sk); + __ip_sock_set_tos(sk, val); +- release_sock(sk); ++ sockopt_release_sock(sk); + } + EXPORT_SYMBOL(ip_sock_set_tos); + +-- +2.34.1 + diff --git a/ci/diffs/0001-selftests-bpf-Filter-out-_GNU_SOURCE-when-compiling-.patch b/ci/diffs/0001-selftests-bpf-Filter-out-_GNU_SOURCE-when-compiling-.patch new file mode 100644 index 0000000000000..da5bcdc455967 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-Filter-out-_GNU_SOURCE-when-compiling-.patch @@ -0,0 +1,51 @@ +From 41c24102af7b6236277a214428b203d51a3462df Mon Sep 17 00:00:00 2001 +From: Stanislav Fomichev +Date: Thu, 25 Jul 2024 14:40:29 -0700 +Subject: [PATCH 1/1] selftests/bpf: Filter out _GNU_SOURCE when compiling + test_cpp + +Jakub reports build failures when merging linux/master with net tree: + +CXX test_cpp +In file included from :454: +:2:9: error: '_GNU_SOURCE' macro redefined [-Werror,-Wmacro-redefined] + 2 | #define _GNU_SOURCE + | ^ +:445:9: note: previous definition is here + 445 | #define _GNU_SOURCE 1 + +The culprit is commit cc937dad85ae ("selftests: centralize -D_GNU_SOURCE= to +CFLAGS in lib.mk") which unconditionally added -D_GNU_SOUCE to CLFAGS. +Apparently clang++ also unconditionally adds it for the C++ targets [0] +which causes a conflict. Add small change in the selftests makefile +to filter it out for test_cpp. + +Not sure which tree it should go via, targeting bpf for now, but net +might be better? + +0: https://stackoverflow.com/questions/11670581/why-is-gnu-source-defined-by-default-and-how-to-turn-it-off + +Signed-off-by: Stanislav Fomichev +Signed-off-by: Andrii Nakryiko +Acked-by: Jiri Olsa +Link: https://lore.kernel.org/bpf/20240725214029.1760809-1-sdf@fomichev.me +--- + tools/testing/selftests/bpf/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index dd49c1d23a60..81d4757ecd4c 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -713,7 +713,7 @@ $(OUTPUT)/xdp_features: xdp_features.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xdp + # Make sure we are able to include and link libbpf against c++. + $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) + $(call msg,CXX,,$@) +- $(Q)$(CXX) $(CFLAGS) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ ++ $(Q)$(CXX) $(subst -D_GNU_SOURCE=,,$(CFLAGS)) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ + + # Benchmark runner + $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(BPFOBJ) +-- +2.43.0 + diff --git a/ci/diffs/0001-selftests-bpf-Fix-bpf_cookie-and-find_vma-in-nested-.patch b/ci/diffs/0001-selftests-bpf-Fix-bpf_cookie-and-find_vma-in-nested-.patch new file mode 100644 index 0000000000000..4ebfe20b24707 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-Fix-bpf_cookie-and-find_vma-in-nested-.patch @@ -0,0 +1,50 @@ +From f3d2080e8cf23125f79e345061149ae40f66816f Mon Sep 17 00:00:00 2001 +From: Song Liu +Date: Mon, 3 Jun 2024 23:43:17 -0700 +Subject: [PATCH bpf-next] selftests/bpf: Fix bpf_cookie and find_vma in nested + VM + +bpf_cookie and find_vma are flaky in nested VMs, which is used by some CI +systems. It turns out these failures are caused by unreliable perf event +in nested VM. Fix these by: + + 1. Use PERF_COUNT_SW_CPU_CLOCK in find_vma; + 2. Increase sample_freq in bpf_cookie. + +Signed-off-by: Song Liu +--- + tools/testing/selftests/bpf/prog_tests/bpf_cookie.c | 2 +- + tools/testing/selftests/bpf/prog_tests/find_vma.c | 4 ++-- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +index 4407ea428e77..070c52c312e5 100644 +--- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c ++++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +@@ -451,7 +451,7 @@ static void pe_subtest(struct test_bpf_cookie *skel) + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_CPU_CLOCK; + attr.freq = 1; +- attr.sample_freq = 1000; ++ attr.sample_freq = 10000; + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); + if (!ASSERT_GE(pfd, 0, "perf_fd")) + goto cleanup; +diff --git a/tools/testing/selftests/bpf/prog_tests/find_vma.c b/tools/testing/selftests/bpf/prog_tests/find_vma.c +index 5165b38f0e59..f7619e0ade10 100644 +--- a/tools/testing/selftests/bpf/prog_tests/find_vma.c ++++ b/tools/testing/selftests/bpf/prog_tests/find_vma.c +@@ -29,8 +29,8 @@ static int open_pe(void) + + /* create perf event */ + attr.size = sizeof(attr); +- attr.type = PERF_TYPE_HARDWARE; +- attr.config = PERF_COUNT_HW_CPU_CYCLES; ++ attr.type = PERF_TYPE_SOFTWARE; ++ attr.config = PERF_COUNT_SW_CPU_CLOCK; + attr.freq = 1; + attr.sample_freq = 1000; + pfd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, PERF_FLAG_FD_CLOEXEC); +-- +2.43.0 + diff --git a/ci/diffs/0001-selftests-bpf-Fix-pyperf180-compilation-failure-with.patch b/ci/diffs/0001-selftests-bpf-Fix-pyperf180-compilation-failure-with.patch new file mode 100644 index 0000000000000..d55d2e7af8651 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-Fix-pyperf180-compilation-failure-with.patch @@ -0,0 +1,78 @@ +From 100888fb6d8a185866b1520031ee7e3182b173de Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Fri, 10 Nov 2023 11:36:44 -0800 +Subject: [PATCH] selftests/bpf: Fix pyperf180 compilation failure with clang18 + +With latest clang18 (main branch of llvm-project repo), when building bpf selftests, + [~/work/bpf-next (master)]$ make -C tools/testing/selftests/bpf LLVM=1 -j + +The following compilation error happens: + fatal error: error in backend: Branch target out of insn range + ... + Stack dump: + 0. Program arguments: clang -g -Wall -Werror -D__TARGET_ARCH_x86 -mlittle-endian + -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/include + -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -I/home/yhs/work/bpf-next/tools/include/uapi + -I/home/yhs/work/bpf-next/tools/testing/selftests/usr/include -idirafter + /home/yhs/work/llvm-project/llvm/build.18/install/lib/clang/18/include -idirafter /usr/local/include + -idirafter /usr/include -Wno-compare-distinct-pointer-types -DENABLE_ATOMICS_TESTS -O2 --target=bpf + -c progs/pyperf180.c -mcpu=v3 -o /home/yhs/work/bpf-next/tools/testing/selftests/bpf/pyperf180.bpf.o + 1. parser at end of file + 2. Code generation + ... + +The compilation failure only happens to cpu=v2 and cpu=v3. cpu=v4 is okay +since cpu=v4 supports 32-bit branch target offset. + +The above failure is due to upstream llvm patch [1] where some inlining behavior +are changed in clang18. + +To workaround the issue, previously all 180 loop iterations are fully unrolled. +The bpf macro __BPF_CPU_VERSION__ (implemented in clang18 recently) is used to avoid +unrolling changes if cpu=v4. If __BPF_CPU_VERSION__ is not available and the +compiler is clang18, the unrollng amount is unconditionally reduced. + + [1] https://github.com/llvm/llvm-project/commit/1a2e77cf9e11dbf56b5720c607313a566eebb16e + +Signed-off-by: Yonghong Song +Signed-off-by: Andrii Nakryiko +Tested-by: Alan Maguire +Link: https://lore.kernel.org/bpf/20231110193644.3130906-1-yonghong.song@linux.dev +--- + tools/testing/selftests/bpf/progs/pyperf180.c | 22 +++++++++++++++++++ + 1 file changed, 22 insertions(+) + +diff --git a/tools/testing/selftests/bpf/progs/pyperf180.c b/tools/testing/selftests/bpf/progs/pyperf180.c +index c39f559d3100..42c4a8b62e36 100644 +--- a/tools/testing/selftests/bpf/progs/pyperf180.c ++++ b/tools/testing/selftests/bpf/progs/pyperf180.c +@@ -1,4 +1,26 @@ + // SPDX-License-Identifier: GPL-2.0 + // Copyright (c) 2019 Facebook + #define STACK_MAX_LEN 180 ++ ++/* llvm upstream commit at clang18 ++ * https://github.com/llvm/llvm-project/commit/1a2e77cf9e11dbf56b5720c607313a566eebb16e ++ * changed inlining behavior and caused compilation failure as some branch ++ * target distance exceeded 16bit representation which is the maximum for ++ * cpu v1/v2/v3. Macro __BPF_CPU_VERSION__ is later implemented in clang18 ++ * to specify which cpu version is used for compilation. So a smaller ++ * unroll_count can be set if __BPF_CPU_VERSION__ is less than 4, which ++ * reduced some branch target distances and resolved the compilation failure. ++ * ++ * To capture the case where a developer/ci uses clang18 but the corresponding ++ * repo checkpoint does not have __BPF_CPU_VERSION__, a smaller unroll_count ++ * will be set as well to prevent potential compilation failures. ++ */ ++#ifdef __BPF_CPU_VERSION__ ++#if __BPF_CPU_VERSION__ < 4 ++#define UNROLL_COUNT 90 ++#endif ++#elif __clang_major__ == 18 ++#define UNROLL_COUNT 90 ++#endif ++ + #include "pyperf.h" +-- +2.34.1 + diff --git a/ci/diffs/0001-selftests-bpf-disable-detection-of-llvm-when-buildin.patch b/ci/diffs/0001-selftests-bpf-disable-detection-of-llvm-when-buildin.patch new file mode 100644 index 0000000000000..6497a6cc38c90 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-disable-detection-of-llvm-when-buildin.patch @@ -0,0 +1,41 @@ +From 42839864a62ee244ec280b09149b1cb439f681db Mon Sep 17 00:00:00 2001 +From: Manu Bretelle +Date: Fri, 27 Oct 2023 18:25:39 -0700 +Subject: [PATCH bpf-next] selftests/bpf: disable detection of llvm when + building bpftool + +The VMs in which we run the selftests do not have llvm installed. +We build selftests/bpftool in a host that have llvm. +bpftool currently will use llvm first and fallback to libbfd but there +is no way to disable detection from the command line. + +Removing it from the feature detection should force us to use libbfd. + +Signed-off-by: Manu Bretelle +--- + tools/bpf/bpftool/Makefile | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile +index e9154ace80ff..01314458e25e 100644 +--- a/tools/bpf/bpftool/Makefile ++++ b/tools/bpf/bpftool/Makefile +@@ -95,7 +95,6 @@ RM ?= rm -f + FEATURE_USER = .bpftool + + FEATURE_TESTS := clang-bpf-co-re +-FEATURE_TESTS += llvm + FEATURE_TESTS += libcap + FEATURE_TESTS += libbfd + FEATURE_TESTS += libbfd-liberty +@@ -104,7 +103,6 @@ FEATURE_TESTS += disassembler-four-args + FEATURE_TESTS += disassembler-init-styled + + FEATURE_DISPLAY := clang-bpf-co-re +-FEATURE_DISPLAY += llvm + FEATURE_DISPLAY += libcap + FEATURE_DISPLAY += libbfd + FEATURE_DISPLAY += libbfd-liberty +-- +2.39.3 + diff --git a/ci/diffs/0001-selftests-bpf-fix-inet_csk_accept-prototype-in-test_.patch b/ci/diffs/0001-selftests-bpf-fix-inet_csk_accept-prototype-in-test_.patch new file mode 100644 index 0000000000000..3fa007c51db68 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-fix-inet_csk_accept-prototype-in-test_.patch @@ -0,0 +1,32 @@ +From 0daad0a615e687e1247230f3d0c31ae60ba32314 Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Tue, 28 May 2024 15:29:38 -0700 +Subject: [PATCH bpf-next] selftests/bpf: fix inet_csk_accept prototype in + test_sk_storage_tracing.c + +Recent kernel change ([0]) changed inet_csk_accept() prototype. Adapt +progs/test_sk_storage_tracing.c to take that into account. + + [0] 92ef0fd55ac8 ("net: change proto and proto_ops accept type") + +Signed-off-by: Andrii Nakryiko +--- + tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c +index 02e718f06e0f..40531e56776e 100644 +--- a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c ++++ b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c +@@ -84,7 +84,7 @@ int BPF_PROG(trace_tcp_connect, struct sock *sk) + } + + SEC("fexit/inet_csk_accept") +-int BPF_PROG(inet_csk_accept, struct sock *sk, int flags, int *err, bool kern, ++int BPF_PROG(inet_csk_accept, struct sock *sk, struct proto_accept_arg *arg, + struct sock *accepted_sk) + { + set_task_info(accepted_sk); +-- +2.43.0 + diff --git a/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch b/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch new file mode 100644 index 0000000000000..ec1e29a8ab974 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch @@ -0,0 +1,31 @@ +From d31a7125891994681503770cff46a119692fb2b9 Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Mon, 11 Dec 2023 17:09:38 -0800 +Subject: [PATCH 1/1] selftests/bpf: work around latest Clang smartness + +Work around the issue while we deal with it in the Clang itself. +See [0]. + + [0] https://github.com/llvm/llvm-project/pull/73662#issuecomment-1849281758 + +Signed-off-by: Andrii Nakryiko +--- + tools/testing/selftests/bpf/progs/iters.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c +index 3aca3dc145b5..929ba6fa2105 100644 +--- a/tools/testing/selftests/bpf/progs/iters.c ++++ b/tools/testing/selftests/bpf/progs/iters.c +@@ -1420,7 +1420,7 @@ SEC("raw_tp") + __success + int iter_arr_with_actual_elem_count(const void *ctx) + { +- int i, n = loop_data.n, sum = 0; ++ unsigned i, n = loop_data.n, sum = 0; + + if (n > ARRAY_SIZE(loop_data.data)) + return 0; +-- +2.34.1 + diff --git a/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch b/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch new file mode 100644 index 0000000000000..e631fac0cc698 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch @@ -0,0 +1,89 @@ +From fe69a1b1b6ed9ffc2c578c63f526026a8ab74f0c Mon Sep 17 00:00:00 2001 +From: Anders Roxell +Date: Thu, 9 Nov 2023 18:43:28 +0100 +Subject: [PATCH] selftests: bpf: xskxceiver: ksft_print_msg: fix format type + error + +Crossbuilding selftests/bpf for architecture arm64, format specifies +type error show up like. + +xskxceiver.c:912:34: error: format specifies type 'int' but the argument +has type '__u64' (aka 'unsigned long long') [-Werror,-Wformat] + ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n", + ~~ + %llu + __func__, pkt->pkt_nb, meta->count); + ^~~~~~~~~~~ +xskxceiver.c:929:55: error: format specifies type 'unsigned long long' but + the argument has type 'u64' (aka 'unsigned long') [-Werror,-Wformat] + ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); + ~~~~ ^~~~ + +Fixing the issues by casting to (unsigned long long) and changing the +specifiers to be %llu from %d and %u, since with u64s it might be %llx +or %lx, depending on architecture. + +Signed-off-by: Anders Roxell +Link: https://lore.kernel.org/r/20231109174328.1774571-1-anders.roxell@linaro.org +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/xskxceiver.c | 19 ++++++++++++------- + 1 file changed, 12 insertions(+), 7 deletions(-) + +diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c +index 591ca9637b23..b604c570309a 100644 +--- a/tools/testing/selftests/bpf/xskxceiver.c ++++ b/tools/testing/selftests/bpf/xskxceiver.c +@@ -908,8 +908,9 @@ static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr) + struct xdp_info *meta = data - sizeof(struct xdp_info); + + if (meta->count != pkt->pkt_nb) { +- ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n", +- __func__, pkt->pkt_nb, meta->count); ++ ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%llu]\n", ++ __func__, pkt->pkt_nb, ++ (unsigned long long)meta->count); + return false; + } + +@@ -926,11 +927,13 @@ static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 exp + + if (addr >= umem->num_frames * umem->frame_size || + addr + len > umem->num_frames * umem->frame_size) { +- ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); ++ ksft_print_msg("Frag invalid addr: %llx len: %u\n", ++ (unsigned long long)addr, len); + return false; + } + if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) { +- ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", addr, len); ++ ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", ++ (unsigned long long)addr, len); + return false; + } + +@@ -1029,7 +1032,8 @@ static int complete_pkts(struct xsk_socket_info *xsk, int batch_size) + u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx + rcvd - 1); + + ksft_print_msg("[%s] Too many packets completed\n", __func__); +- ksft_print_msg("Last completion address: %llx\n", addr); ++ ksft_print_msg("Last completion address: %llx\n", ++ (unsigned long long)addr); + return TEST_FAILURE; + } + +@@ -1513,8 +1517,9 @@ static int validate_tx_invalid_descs(struct ifobject *ifobject) + } + + if (stats.tx_invalid_descs != ifobject->xsk->pkt_stream->nb_pkts / 2) { +- ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%u] expected [%u]\n", +- __func__, stats.tx_invalid_descs, ++ ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%llu] expected [%u]\n", ++ __func__, ++ (unsigned long long)stats.tx_invalid_descs, + ifobject->xsk->pkt_stream->nb_pkts); + return TEST_FAILURE; + } +-- +2.34.1 + diff --git a/ci/diffs/0001-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch b/ci/diffs/0001-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch new file mode 100644 index 0000000000000..19d269de7e8ca --- /dev/null +++ b/ci/diffs/0001-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch @@ -0,0 +1,142 @@ +From 3772e6cdb51f21a11df2acf6aa431cc8b9137bfb Mon Sep 17 00:00:00 2001 +From: Viktor Malik +Date: Tue, 6 Feb 2024 13:46:09 +0100 +Subject: [PATCH 1/2] tools/resolve_btfids: Refactor set sorting with types + from btf_ids.h + +Instead of using magic offsets to access BTF ID set data, leverage types +from btf_ids.h (btf_id_set and btf_id_set8) which define the actual +layout of the data. Thanks to this change, set sorting should also +continue working if the layout changes. + +This requires to sync the definition of 'struct btf_id_set8' from +include/linux/btf_ids.h to tools/include/linux/btf_ids.h. We don't sync +the rest of the file at the moment, b/c that would require to also sync +multiple dependent headers and we don't need any other defs from +btf_ids.h. + +Signed-off-by: Viktor Malik +Signed-off-by: Andrii Nakryiko +Acked-by: Daniel Xu +Link: https://lore.kernel.org/bpf/ff7f062ddf6a00815fda3087957c4ce667f50532.1707223196.git.vmalik@redhat.com +--- + tools/bpf/resolve_btfids/main.c | 35 ++++++++++++++++++++------------- + tools/include/linux/btf_ids.h | 9 +++++++++ + 2 files changed, 30 insertions(+), 14 deletions(-) + +diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c +index 27a23196d58e..32634f00abba 100644 +--- a/tools/bpf/resolve_btfids/main.c ++++ b/tools/bpf/resolve_btfids/main.c +@@ -70,6 +70,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -78,7 +79,7 @@ + #include + + #define BTF_IDS_SECTION ".BTF_ids" +-#define BTF_ID "__BTF_ID__" ++#define BTF_ID_PREFIX "__BTF_ID__" + + #define BTF_STRUCT "struct" + #define BTF_UNION "union" +@@ -161,7 +162,7 @@ static int eprintf(int level, int var, const char *fmt, ...) + + static bool is_btf_id(const char *name) + { +- return name && !strncmp(name, BTF_ID, sizeof(BTF_ID) - 1); ++ return name && !strncmp(name, BTF_ID_PREFIX, sizeof(BTF_ID_PREFIX) - 1); + } + + static struct btf_id *btf_id__find(struct rb_root *root, const char *name) +@@ -441,7 +442,7 @@ static int symbols_collect(struct object *obj) + * __BTF_ID__TYPE__vfs_truncate__0 + * prefix = ^ + */ +- prefix = name + sizeof(BTF_ID) - 1; ++ prefix = name + sizeof(BTF_ID_PREFIX) - 1; + + /* struct */ + if (!strncmp(prefix, BTF_STRUCT, sizeof(BTF_STRUCT) - 1)) { +@@ -649,19 +650,18 @@ static int cmp_id(const void *pa, const void *pb) + static int sets_patch(struct object *obj) + { + Elf_Data *data = obj->efile.idlist; +- int *ptr = data->d_buf; + struct rb_node *next; + + next = rb_first(&obj->sets); + while (next) { +- unsigned long addr, idx; ++ struct btf_id_set8 *set8; ++ struct btf_id_set *set; ++ unsigned long addr, off; + struct btf_id *id; +- int *base; +- int cnt; + + id = rb_entry(next, struct btf_id, rb_node); + addr = id->addr[0]; +- idx = addr - obj->efile.idlist_addr; ++ off = addr - obj->efile.idlist_addr; + + /* sets are unique */ + if (id->addr_cnt != 1) { +@@ -670,14 +670,21 @@ static int sets_patch(struct object *obj) + return -1; + } + +- idx = idx / sizeof(int); +- base = &ptr[idx] + (id->is_set8 ? 2 : 1); +- cnt = ptr[idx]; ++ if (id->is_set) { ++ set = data->d_buf + off; ++ qsort(set->ids, set->cnt, sizeof(set->ids[0]), cmp_id); ++ } else { ++ set8 = data->d_buf + off; ++ /* ++ * Make sure id is at the beginning of the pairs ++ * struct, otherwise the below qsort would not work. ++ */ ++ BUILD_BUG_ON(set8->pairs != &set8->pairs[0].id); ++ qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id); ++ } + + pr_debug("sorting addr %5lu: cnt %6d [%s]\n", +- (idx + 1) * sizeof(int), cnt, id->name); +- +- qsort(base, cnt, id->is_set8 ? sizeof(uint64_t) : sizeof(int), cmp_id); ++ off, id->is_set ? set->cnt : set8->cnt, id->name); + + next = rb_next(next); + } +diff --git a/tools/include/linux/btf_ids.h b/tools/include/linux/btf_ids.h +index 2f882d5cb30f..72535f00572f 100644 +--- a/tools/include/linux/btf_ids.h ++++ b/tools/include/linux/btf_ids.h +@@ -8,6 +8,15 @@ struct btf_id_set { + u32 ids[]; + }; + ++struct btf_id_set8 { ++ u32 cnt; ++ u32 flags; ++ struct { ++ u32 id; ++ u32 flags; ++ } pairs[]; ++}; ++ + #ifdef CONFIG_DEBUG_INFO_BTF + + #include /* for __PASTE */ +-- +2.39.3 + + + diff --git a/ci/diffs/0001-tracing-kprobes-Fix-symbol-counting-logic-by-looking.patch b/ci/diffs/0001-tracing-kprobes-Fix-symbol-counting-logic-by-looking.patch new file mode 100644 index 0000000000000..24ebc231056cb --- /dev/null +++ b/ci/diffs/0001-tracing-kprobes-Fix-symbol-counting-logic-by-looking.patch @@ -0,0 +1,65 @@ +From 08969a676d234a178ff9f8c67936a2ad98a741eb Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Fri, 27 Oct 2023 16:22:24 -0700 +Subject: [PATCH] tracing/kprobes: Fix symbol counting logic by looking at + modules as well + +Recent changes to count number of matching symbols when creating +a kprobe event failed to take into account kernel modules. As such, it +breaks kprobes on kernel module symbols, by assuming there is no match. + +Fix this my calling module_kallsyms_on_each_symbol() in addition to +kallsyms_on_each_match_symbol() to perform a proper counting. + +Cc: Francis Laniel +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Steven Rostedt +Fixes: b022f0c7e404 ("tracing/kprobes: Return EADDRNOTAVAIL when func matches several symbols") +Signed-off-by: Andrii Nakryiko +--- + kernel/trace/trace_kprobe.c | 24 ++++++++++++++++++++---- + 1 file changed, 20 insertions(+), 4 deletions(-) + +diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c +index effcaede4759..1efb27f35963 100644 +--- a/kernel/trace/trace_kprobe.c ++++ b/kernel/trace/trace_kprobe.c +@@ -714,14 +714,30 @@ static int count_symbols(void *data, unsigned long unused) + return 0; + } + ++struct sym_count_ctx { ++ unsigned int count; ++ const char *name; ++}; ++ ++static int count_mod_symbols(void *data, const char *name, unsigned long unused) ++{ ++ struct sym_count_ctx *ctx = data; ++ ++ if (strcmp(name, ctx->name) == 0) ++ ctx->count++; ++ ++ return 0; ++} ++ + static unsigned int number_of_same_symbols(char *func_name) + { +- unsigned int count; ++ struct sym_count_ctx ctx = { .count = 0, .name = func_name }; ++ ++ kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count); + +- count = 0; +- kallsyms_on_each_match_symbol(count_symbols, func_name, &count); ++ module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx); + +- return count; ++ return ctx.count; + } + + static int __trace_kprobe_create(int argc, const char *argv[]) +-- +2.34.1 + diff --git a/ci/diffs/0002-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch b/ci/diffs/0002-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch new file mode 100644 index 0000000000000..c4d67693bd132 --- /dev/null +++ b/ci/diffs/0002-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch @@ -0,0 +1,117 @@ +From c3dcadfdf2bf8f01471066700c098b5185240df6 Mon Sep 17 00:00:00 2001 +From: Viktor Malik +Date: Tue, 6 Feb 2024 13:46:10 +0100 +Subject: [PATCH 2/2] tools/resolve_btfids: Fix cross-compilation to non-host + endianness + +The .BTF_ids section is pre-filled with zeroed BTF ID entries during the +build and afterwards patched by resolve_btfids with correct values. +Since resolve_btfids always writes in host-native endianness, it relies +on libelf to do the translation when the target ELF is cross-compiled to +a different endianness (this was introduced in commit 61e8aeda9398 +("bpf: Fix libelf endian handling in resolv_btfids")). + +Unfortunately, the translation will corrupt the flags fields of SET8 +entries because these were written during vmlinux compilation and are in +the correct endianness already. This will lead to numerous selftests +failures such as: + + $ sudo ./test_verifier 502 502 + #502/p sleepable fentry accept FAIL + Failed to load prog 'Invalid argument'! + bpf_fentry_test1 is not sleepable + verification time 34 usec + stack depth 0 + processed 0 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 + Summary: 0 PASSED, 0 SKIPPED, 1 FAILED + +Since it's not possible to instruct libelf to translate just certain +values, let's manually bswap the flags (both global and entry flags) in +resolve_btfids when needed, so that libelf then translates everything +correctly. + +Fixes: ef2c6f370a63 ("tools/resolve_btfids: Add support for 8-byte BTF sets") +Signed-off-by: Viktor Malik +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/bpf/7b6bff690919555574ce0f13d2a5996cacf7bf69.1707223196.git.vmalik@redhat.com +--- + tools/bpf/resolve_btfids/main.c | 35 +++++++++++++++++++++++++++++++++ + 1 file changed, 35 insertions(+) + +diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c +index 32634f00abba..d9520cb826b3 100644 +--- a/tools/bpf/resolve_btfids/main.c ++++ b/tools/bpf/resolve_btfids/main.c +@@ -90,6 +90,14 @@ + + #define ADDR_CNT 100 + ++#if __BYTE_ORDER == __LITTLE_ENDIAN ++# define ELFDATANATIVE ELFDATA2LSB ++#elif __BYTE_ORDER == __BIG_ENDIAN ++# define ELFDATANATIVE ELFDATA2MSB ++#else ++# error "Unknown machine endianness!" ++#endif ++ + struct btf_id { + struct rb_node rb_node; + char *name; +@@ -117,6 +125,7 @@ struct object { + int idlist_shndx; + size_t strtabidx; + unsigned long idlist_addr; ++ int encoding; + } efile; + + struct rb_root sets; +@@ -320,6 +329,7 @@ static int elf_collect(struct object *obj) + { + Elf_Scn *scn = NULL; + size_t shdrstrndx; ++ GElf_Ehdr ehdr; + int idx = 0; + Elf *elf; + int fd; +@@ -351,6 +361,13 @@ static int elf_collect(struct object *obj) + return -1; + } + ++ if (gelf_getehdr(obj->efile.elf, &ehdr) == NULL) { ++ pr_err("FAILED cannot get ELF header: %s\n", ++ elf_errmsg(-1)); ++ return -1; ++ } ++ obj->efile.encoding = ehdr.e_ident[EI_DATA]; ++ + /* + * Scan all the elf sections and look for save data + * from .BTF_ids section and symbols. +@@ -681,6 +698,24 @@ static int sets_patch(struct object *obj) + */ + BUILD_BUG_ON(set8->pairs != &set8->pairs[0].id); + qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id); ++ ++ /* ++ * When ELF endianness does not match endianness of the ++ * host, libelf will do the translation when updating ++ * the ELF. This, however, corrupts SET8 flags which are ++ * already in the target endianness. So, let's bswap ++ * them to the host endianness and libelf will then ++ * correctly translate everything. ++ */ ++ if (obj->efile.encoding != ELFDATANATIVE) { ++ int i; ++ ++ set8->flags = bswap_32(set8->flags); ++ for (i = 0; i < set8->cnt; i++) { ++ set8->pairs[i].flags = ++ bswap_32(set8->pairs[i].flags); ++ } ++ } + } + + pr_debug("sorting addr %5lu: cnt %6d [%s]\n", +-- +2.39.3 + diff --git a/ci/diffs/0099-s390x_nolockdep.diff b/ci/diffs/0099-s390x_nolockdep.diff new file mode 100644 index 0000000000000..44c2d1a520656 --- /dev/null +++ b/ci/diffs/0099-s390x_nolockdep.diff @@ -0,0 +1,48 @@ +From 470d0c7874ac638ea62cddc3a20ec047fa4ab539 Mon Sep 17 00:00:00 2001 +From: Manu Bretelle +Date: Wed, 14 Feb 2024 17:25:35 -0800 +Subject: [PATCH] bpf/selftests: disable lockdep on s390x + +Tests are slow to run on s390x, this should make them faster. + +Signed-off-by: Manu Bretelle +--- + tools/testing/selftests/bpf/config.s390x | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/tools/testing/selftests/bpf/config.s390x b/tools/testing/selftests/bpf/config.s390x +index 706931a8c2c69..67bfd62b0b582 100644 +--- a/tools/testing/selftests/bpf/config.s390x ++++ b/tools/testing/selftests/bpf/config.s390x +@@ -23,11 +23,11 @@ CONFIG_CPUSETS=y + CONFIG_CRASH_DUMP=y + CONFIG_CRYPTO_USER_API_RNG=y + CONFIG_CRYPTO_USER_API_SKCIPHER=y +-CONFIG_DEBUG_ATOMIC_SLEEP=y ++CONFIG_DEBUG_ATOMIC_SLEEP=n + CONFIG_DEBUG_INFO_BTF=y + CONFIG_DEBUG_INFO_DWARF4=y + CONFIG_DEBUG_LIST=y +-CONFIG_DEBUG_LOCKDEP=y ++CONFIG_DEBUG_LOCKDEP=n + CONFIG_DEBUG_NOTIFIERS=y + CONFIG_DEBUG_PAGEALLOC=y + CONFIG_DEBUG_SECTION_MISMATCH=y +@@ -71,7 +71,7 @@ CONFIG_KRETPROBES=y + CONFIG_KSM=y + CONFIG_LATENCYTOP=y + CONFIG_LIVEPATCH=y +-CONFIG_LOCK_STAT=y ++CONFIG_LOCK_STAT=n + CONFIG_MACVLAN=y + CONFIG_MACVTAP=y + CONFIG_MAGIC_SYSRQ=y +@@ -101,7 +101,7 @@ CONFIG_PCI=y + CONFIG_POSIX_MQUEUE=y + CONFIG_PROC_KCORE=y + CONFIG_PROFILING=y +-CONFIG_PROVE_LOCKING=y ++CONFIG_PROVE_LOCKING=n + CONFIG_PTDUMP_DEBUGFS=y + CONFIG_RC_DEVICES=y + CONFIG_RC_LOOPBACK=y diff --git a/ci/diffs/0099-scripts-sorttable-Fix-endianness-handling-in-build-t.patch b/ci/diffs/0099-scripts-sorttable-Fix-endianness-handling-in-build-t.patch new file mode 100644 index 0000000000000..9702960f2b4c3 --- /dev/null +++ b/ci/diffs/0099-scripts-sorttable-Fix-endianness-handling-in-build-t.patch @@ -0,0 +1,41 @@ +From 5f799e4752d441263ae6656062e2fca98f51c6cb Mon Sep 17 00:00:00 2001 +From: Vasily Gorbik +Date: Wed, 2 Apr 2025 03:15:35 +0200 +Subject: [PATCH] scripts/sorttable: Fix endianness handling in build-time + mcount sort + +Kernel cross-compilation with BUILDTIME_MCOUNT_SORT produces zeroed +mcount values if the build-host endianness does not match the ELF +file endianness. + +The mcount values array is converted from ELF file +endianness to build-host endianness during initialization in +fill_relocs()/fill_addrs(). Avoid extra conversion of these values during +weak-function zeroing; otherwise, they do not match nm-parsed addresses +and all mcount values are zeroed out. + +Fixes: ef378c3b8233 ("scripts/sorttable: Zero out weak functions in mcount_loc table") +Reported-by: Ilya Leoshkevich +Reported-by: Ihor Solodrai +Closes: https://lore.kernel.org/all/your-ad-here.call-01743522822-ext-4975@work.hours/ +Signed-off-by: Vasily Gorbik +--- + scripts/sorttable.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/scripts/sorttable.c b/scripts/sorttable.c +index 7b4b3714b1af..deed676bfe38 100644 +--- a/scripts/sorttable.c ++++ b/scripts/sorttable.c +@@ -857,7 +857,7 @@ static void *sort_mcount_loc(void *arg) + for (void *ptr = vals; ptr < vals + size; ptr += long_size) { + uint64_t key; + +- key = long_size == 4 ? r((uint32_t *)ptr) : r8((uint64_t *)ptr); ++ key = long_size == 4 ? *(uint32_t *)ptr : *(uint64_t *)ptr; + if (!find_func(key)) { + if (long_size == 4) + *(uint32_t *)ptr = 0; +-- +2.49.0 + diff --git a/ci/diffs/0099-selftest-cross-compile.diff b/ci/diffs/0099-selftest-cross-compile.diff new file mode 100644 index 0000000000000..e8732596bdb3f --- /dev/null +++ b/ci/diffs/0099-selftest-cross-compile.diff @@ -0,0 +1,13 @@ +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index a38a3001527c..af68528cc944 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -304,7 +304,7 @@ $(OUTPUT)/test_maps: $(TESTING_HELPERS) + $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) $(UNPRIV_HELPERS) + $(OUTPUT)/xsk.o: $(BPFOBJ) + +-BPFTOOL ?= $(DEFAULT_BPFTOOL) ++BPFTOOL ?= $(TRUNNER_BPFTOOL) + $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ + $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool + $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ diff --git a/ci/diffs/0199-iov_iter-fix-advancing-slot-in-iter_folioq_get_pages.patch b/ci/diffs/0199-iov_iter-fix-advancing-slot-in-iter_folioq_get_pages.patch new file mode 100644 index 0000000000000..b81d22a35322c --- /dev/null +++ b/ci/diffs/0199-iov_iter-fix-advancing-slot-in-iter_folioq_get_pages.patch @@ -0,0 +1,46 @@ +From 0d24852bd71ec85ca0016b6d6fc997e6a3381552 Mon Sep 17 00:00:00 2001 +From: Omar Sandoval +Date: Mon, 30 Sep 2024 11:55:00 -0700 +Subject: [PATCH] iov_iter: fix advancing slot in iter_folioq_get_pages() + +iter_folioq_get_pages() decides to advance to the next folioq slot when +it has reached the end of the current folio. However, it is checking +offset, which is the beginning of the current part, instead of +iov_offset, which is adjusted to the end of the current part, so it +doesn't advance the slot when it's supposed to. As a result, on the next +iteration, we'll use the same folio with an out-of-bounds offset and +return an unrelated page. + +This manifested as various crashes and other failures in 9pfs in drgn's +VM testing setup and BPF CI. + +Fixes: db0aa2e9566f ("mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios") +Link: https://lore.kernel.org/linux-fsdevel/20240923183432.1876750-1-chantr4@gmail.com/ +Tested-by: Manu Bretelle +Signed-off-by: Omar Sandoval +Link: https://lore.kernel.org/r/cbaf141ba6c0e2e209717d02746584072844841a.1727722269.git.osandov@fb.com +Tested-by: Eduard Zingerman +Tested-by: Leon Romanovsky +Tested-by: Joey Gouly +Acked-by: David Howells +Signed-off-by: Christian Brauner +--- + lib/iov_iter.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/iov_iter.c b/lib/iov_iter.c +index 97003155b..1abb32c0d 100644 +--- a/lib/iov_iter.c ++++ b/lib/iov_iter.c +@@ -1033,7 +1033,7 @@ static ssize_t iter_folioq_get_pages(struct iov_iter *iter, + if (maxpages == 0 || extracted >= maxsize) + break; + +- if (offset >= fsize) { ++ if (iov_offset >= fsize) { + iov_offset = 0; + slot++; + if (slot == folioq_nr_slots(folioq) && folioq->next) { +-- +2.34.1 + diff --git a/ci/diffs/0299-selftests-bpf-Fix-uprobe-consumer-test.patch b/ci/diffs/0299-selftests-bpf-Fix-uprobe-consumer-test.patch new file mode 100644 index 0000000000000..11aa3626f5a81 --- /dev/null +++ b/ci/diffs/0299-selftests-bpf-Fix-uprobe-consumer-test.patch @@ -0,0 +1,58 @@ +From affb32e4f056883f285f8535b766293b85752fb4 Mon Sep 17 00:00:00 2001 +From: Jiri Olsa +Date: Tue, 24 Sep 2024 13:07:30 +0200 +Subject: [PATCH] selftests/bpf: Fix uprobe consumer test + +With newly merged code the uprobe behaviour is slightly different +and affects uprobe consumer test. + +We no longer need to check if the uprobe object is still preserved +after removing last uretprobe, because it stays as long as there's +pending/installed uretprobe instance. + +This allows to run uretprobe consumers registered 'after' uprobe was +hit even if previous uretprobe got unregistered before being hit. + +The uprobe object will be now removed after the last uprobe ref is +released and in such case it's held by ri->uprobe (return instance) +which is released after the uretprobe is hit. + +Reported-by: Ihor Solodrai +Signed-off-by: Jiri Olsa +Signed-off-by: Daniel Borkmann +Tested-by: Ihor Solodrai +Closes: https://lore.kernel.org/bpf/w6U8Z9fdhjnkSp2UaFaV1fGqJXvfLEtDKEUyGDkwmoruDJ_AgF_c0FFhrkeKW18OqiP-05s9yDKiT6X-Ns-avN_ABf0dcUkXqbSJN1TQSXo=@pm.me/ +--- + .../testing/selftests/bpf/prog_tests/uprobe_multi_test.c | 9 +-------- + 1 file changed, 1 insertion(+), 8 deletions(-) + +diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +index 844f6fc8487b..c1ac813ff9ba 100644 +--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c ++++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +@@ -869,21 +869,14 @@ static void consumer_test(struct uprobe_multi_consumers *skel, + fmt = "prog 0/1: uprobe"; + } else { + /* +- * uprobe return is tricky ;-) +- * + * to trigger uretprobe consumer, the uretprobe needs to be installed, + * which means one of the 'return' uprobes was alive when probe was hit: + * + * idxs: 2/3 uprobe return in 'installed' mask +- * +- * in addition if 'after' state removes everything that was installed in +- * 'before' state, then uprobe kernel object goes away and return uprobe +- * is not installed and we won't hit it even if it's in 'after' state. + */ + unsigned long had_uretprobes = before & 0b1100; /* is uretprobe installed */ +- unsigned long probe_preserved = before & after; /* did uprobe go away */ + +- if (had_uretprobes && probe_preserved && test_bit(idx, after)) ++ if (had_uretprobes && test_bit(idx, after)) + val++; + fmt = "idx 2/3: uretprobe"; + } +-- +2.34.1 + diff --git a/ci/diffs/0399-selftests-sched_ext-fix-build-after-renames-in-sched.patch b/ci/diffs/0399-selftests-sched_ext-fix-build-after-renames-in-sched.patch new file mode 100644 index 0000000000000..ba37429396236 --- /dev/null +++ b/ci/diffs/0399-selftests-sched_ext-fix-build-after-renames-in-sched.patch @@ -0,0 +1,231 @@ +From 5565144e82b97c5d2082ab19866836dfe5b2e592 Mon Sep 17 00:00:00 2001 +From: Ihor Solodrai +Date: Thu, 21 Nov 2024 13:20:46 -0800 +Subject: [PATCH] selftests/sched_ext: fix build after renames in sched_ext API + +The selftests are falining to build on current tip of bpf-next and +sched_ext [1]. This has broken BPF CI [2] after merge from upstream. + +Use appropriate function names in the selftests according to the +recent changes in the sched_ext API [3]. + +[1] +https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=fc39fb56917bb3cb53e99560ca3612a84456ada2 +[2] https://github.com/kernel-patches/bpf/actions/runs/11959327258/job/33340923745 +[3] https://lore.kernel.org/all/20241109194853.580310-1-tj@kernel.org/ + +Signed-off-by: Ihor Solodrai +--- + .../testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c | 2 +- + .../selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c | 4 ++-- + tools/testing/selftests/sched_ext/dsp_local_on.bpf.c | 2 +- + .../selftests/sched_ext/enq_select_cpu_fails.bpf.c | 2 +- + tools/testing/selftests/sched_ext/exit.bpf.c | 4 ++-- + tools/testing/selftests/sched_ext/maximal.bpf.c | 4 ++-- + tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c | 2 +- + .../selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c | 2 +- + .../testing/selftests/sched_ext/select_cpu_dispatch.bpf.c | 2 +- + .../selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c | 2 +- + .../selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c | 4 ++-- + tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c | 8 ++++---- + 12 files changed, 19 insertions(+), 19 deletions(-) + +diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c +index 37d9bf6fb745..6f4c3f5a1c5d 100644 +--- a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c ++++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c +@@ -20,7 +20,7 @@ s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, + * If we dispatch to a bogus DSQ that will fall back to the + * builtin global DSQ, we fail gracefully. + */ +- scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, ++ scx_bpf_dsq_insert_vtime(p, 0xcafef00d, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); + return cpu; + } +diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c +index dffc97d9cdf1..e4a55027778f 100644 +--- a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c ++++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c +@@ -17,8 +17,8 @@ s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, + + if (cpu >= 0) { + /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. */ +- scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, +- p->scx.dsq_vtime, 0); ++ scx_bpf_dsq_insert_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, ++ p->scx.dsq_vtime, 0); + return cpu; + } + +diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +index 6a7db1502c29..6325bf76f47e 100644 +--- a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c ++++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +@@ -45,7 +45,7 @@ void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) + + target = bpf_get_prandom_u32() % nr_cpus; + +- scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); ++ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } + +diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c +index 1efb50d61040..a7cf868d5e31 100644 +--- a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c ++++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c +@@ -31,7 +31,7 @@ void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p, + /* Can only call from ops.select_cpu() */ + scx_bpf_select_cpu_dfl(p, 0, 0, &found); + +- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } + + SEC(".struct_ops.link") +diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c +index d75d4faf07f6..4bc36182d3ff 100644 +--- a/tools/testing/selftests/sched_ext/exit.bpf.c ++++ b/tools/testing/selftests/sched_ext/exit.bpf.c +@@ -33,7 +33,7 @@ void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags) + if (exit_point == EXIT_ENQUEUE) + EXIT_CLEANLY(); + +- scx_bpf_dispatch(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); + } + + void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) +@@ -41,7 +41,7 @@ void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) + if (exit_point == EXIT_DISPATCH) + EXIT_CLEANLY(); + +- scx_bpf_consume(DSQ_ID); ++ scx_bpf_dsq_move_to_local(DSQ_ID); + } + + void BPF_STRUCT_OPS(exit_enable, struct task_struct *p) +diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c +index 4d4cd8d966db..4c005fa71810 100644 +--- a/tools/testing/selftests/sched_ext/maximal.bpf.c ++++ b/tools/testing/selftests/sched_ext/maximal.bpf.c +@@ -20,7 +20,7 @@ s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, + + void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) + { +- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } + + void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) +@@ -28,7 +28,7 @@ void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) + + void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) + { +- scx_bpf_consume(SCX_DSQ_GLOBAL); ++ scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL); + } + + void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c +index f171ac470970..13d0f5be788d 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c +@@ -30,7 +30,7 @@ void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p, + } + scx_bpf_put_idle_cpumask(idle_mask); + +- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } + + SEC(".struct_ops.link") +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c +index 9efdbb7da928..815f1d5d61ac 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c +@@ -67,7 +67,7 @@ void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, + saw_local = true; + } + +- scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, enq_flags); + } + + s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c +index 59bfc4f36167..4bb99699e920 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c +@@ -29,7 +29,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, + cpu = prev_cpu; + + dispatch: +- scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); ++ scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, 0); + return cpu; + } + +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c +index 3bbd5fcdfb18..2a75de11b2cf 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c +@@ -18,7 +18,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p + s32 prev_cpu, u64 wake_flags) + { + /* Dispatching to a random DSQ should fail. */ +- scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); ++ scx_bpf_dsq_insert(p, 0xcafef00d, SCX_SLICE_DFL, 0); + + return prev_cpu; + } +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c +index 0fda57fe0ecf..99d075695c97 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c +@@ -18,8 +18,8 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p + s32 prev_cpu, u64 wake_flags) + { + /* Dispatching twice in a row is disallowed. */ +- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); +- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); ++ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); ++ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + + return prev_cpu; + } +diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c +index e6c67bcf5e6e..bfcb96cd4954 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c +@@ -2,8 +2,8 @@ + /* + * A scheduler that validates that enqueue flags are properly stored and + * applied at dispatch time when a task is directly dispatched from +- * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and +- * making the test a very basic vtime scheduler. ++ * ops.select_cpu(). We validate this by using scx_bpf_dsq_insert_vtime(), ++ * and making the test a very basic vtime scheduler. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet +@@ -47,13 +47,13 @@ s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p, + cpu = prev_cpu; + scx_bpf_test_and_clear_cpu_idle(cpu); + ddsp: +- scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); ++ scx_bpf_dsq_insert_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); + return cpu; + } + + void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) + { +- if (scx_bpf_consume(VTIME_DSQ)) ++ if (scx_bpf_dsq_move_to_local(VTIME_DSQ)) + consumed = true; + } + +-- +2.47.0 + diff --git a/ci/diffs/0499-samples-bpf-fix-samples-compilation.patch b/ci/diffs/0499-samples-bpf-fix-samples-compilation.patch new file mode 100644 index 0000000000000..1f95c3c237cb9 --- /dev/null +++ b/ci/diffs/0499-samples-bpf-fix-samples-compilation.patch @@ -0,0 +1,61 @@ +From 80a5958a52b86e39c1a1bf5f4702011c0cf6ab4f Mon Sep 17 00:00:00 2001 +From: Eduard Zingerman +Date: Mon, 2 Dec 2024 12:14:46 -0800 +Subject: [PATCH] samples/bpf: fix samples compilation + +Commit [0] breaks samples build. + +TODO: moar details here + +[0] 13b25489b6f8 ("kbuild: change working directory to external module directory with M=") + +Signed-off-by: Eduard Zingerman +Signed-off-by: Ihor Solodrai +--- + samples/bpf/Makefile | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile +index bcf103a4c14f..ee10dbf1b471 100644 +--- a/samples/bpf/Makefile ++++ b/samples/bpf/Makefile +@@ -146,13 +146,14 @@ ifeq ($(ARCH), x86) + BPF_EXTRA_CFLAGS += -fcf-protection + endif + +-TPROGS_CFLAGS += -Wall -O2 +-TPROGS_CFLAGS += -Wmissing-prototypes +-TPROGS_CFLAGS += -Wstrict-prototypes +-TPROGS_CFLAGS += $(call try-run,\ ++COMMON_CFLAGS += -Wall -O2 ++COMMON_CFLAGS += -Wmissing-prototypes ++COMMON_CFLAGS += -Wstrict-prototypes ++COMMON_CFLAGS += $(call try-run,\ + printf "int main() { return 0; }" |\ + $(CC) -Werror -fsanitize=bounds -x c - -o "$$TMP",-fsanitize=bounds,) + ++TPROGS_CFLAGS += $(COMMON_CFLAGS) + TPROGS_CFLAGS += -I$(objtree)/usr/include + TPROGS_CFLAGS += -I$(srctree)/tools/testing/selftests/bpf/ + TPROGS_CFLAGS += -I$(LIBBPF_INCLUDE) +@@ -229,7 +230,7 @@ clean: + + $(LIBBPF): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT) + # Fix up variables inherited from Kbuild that tools/ build system won't like +- $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(TPROGS_CFLAGS)" \ ++ $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(COMMON_CFLAGS)" \ + LDFLAGS="$(TPROGS_LDFLAGS)" srctree=$(BPF_SAMPLES_PATH)/../../ \ + O= OUTPUT=$(LIBBPF_OUTPUT)/ DESTDIR=$(LIBBPF_DESTDIR) prefix= \ + $@ install_headers +@@ -305,7 +306,7 @@ $(obj)/$(TRACE_HELPERS): TPROGS_CFLAGS := $(TPROGS_CFLAGS) -D__must_check= + -include $(BPF_SAMPLES_PATH)/Makefile.target + + VMLINUX_BTF_PATHS ?= $(abspath $(if $(O),$(O)/vmlinux)) \ +- $(abspath $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)) \ ++ $(abspath $(if $(objtree),$(objtree)/vmlinux)) \ + $(abspath ./vmlinux) + VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) + +-- +2.47.0 + diff --git a/ci/diffs/2000-s390-fgraph-Fix-to-remove-ftrace_test_recursion_tryl.patch b/ci/diffs/2000-s390-fgraph-Fix-to-remove-ftrace_test_recursion_tryl.patch new file mode 100644 index 0000000000000..7e38b1a151a10 --- /dev/null +++ b/ci/diffs/2000-s390-fgraph-Fix-to-remove-ftrace_test_recursion_tryl.patch @@ -0,0 +1,46 @@ +From faf291ff4beaef8dedebd166f11f815cdee257dc Mon Sep 17 00:00:00 2001 +From: "Masami Hiramatsu (Google)" +Date: Wed, 29 Jan 2025 00:29:37 +0900 +Subject: [PATCH 2000/2001] s390: fgraph: Fix to remove + ftrace_test_recursion_trylock() + +Fix to remove ftrace_test_recursion_trylock() from ftrace_graph_func() +because commit d576aec24df9 ("fgraph: Get ftrace recursion lock in +function_graph_enter") has been moved it to function_graph_enter_regs() +already. + +Reported-by: Jiri Olsa +Closes: https://lore.kernel.org/all/Z5O0shrdgeExZ2kF@krava/ +Fixes: d576aec24df9 ("fgraph: Get ftrace recursion lock in function_graph_enter") +Signed-off-by: Masami Hiramatsu (Google) +Tested-by: Jiri Olsa +--- + arch/s390/kernel/ftrace.c | 5 ----- + 1 file changed, 5 deletions(-) + +diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c +index c0b2c97efefb..63ba6306632e 100644 +--- a/arch/s390/kernel/ftrace.c ++++ b/arch/s390/kernel/ftrace.c +@@ -266,18 +266,13 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) + { + unsigned long *parent = &arch_ftrace_regs(fregs)->regs.gprs[14]; +- int bit; + + if (unlikely(ftrace_graph_is_dead())) + return; + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + return; +- bit = ftrace_test_recursion_trylock(ip, *parent); +- if (bit < 0) +- return; + if (!function_graph_enter_regs(*parent, ip, 0, parent, fregs)) + *parent = (unsigned long)&return_to_handler; +- ftrace_test_recursion_unlock(bit); + } + + #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +-- +2.48.1 + diff --git a/ci/diffs/2001-s390-tracing-Define-ftrace_get_symaddr-for-s390.patch b/ci/diffs/2001-s390-tracing-Define-ftrace_get_symaddr-for-s390.patch new file mode 100644 index 0000000000000..66483206f448f --- /dev/null +++ b/ci/diffs/2001-s390-tracing-Define-ftrace_get_symaddr-for-s390.patch @@ -0,0 +1,28 @@ +From 04fce0d606f59a62105729094013c4784492ec7b Mon Sep 17 00:00:00 2001 +From: "Masami Hiramatsu (Google)" +Date: Wed, 29 Jan 2025 00:29:48 +0900 +Subject: [PATCH 2001/2001] s390: tracing: Define ftrace_get_symaddr() for s390 + +Add ftrace_get_symaddr() for s390, which returns the symbol address +from ftrace's 'ip' parameter. + +Signed-off-by: Masami Hiramatsu (Google) +--- + arch/s390/include/asm/ftrace.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h +index a3b73a4f626e..185331e91f83 100644 +--- a/arch/s390/include/asm/ftrace.h ++++ b/arch/s390/include/asm/ftrace.h +@@ -51,6 +51,7 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) + { + return addr; + } ++#define ftrace_get_symaddr(fentry_ip) ((unsigned long)(fentry_ip)) + + #include + +-- +2.48.1 + diff --git a/ci/diffs/2001-selftests-bpf-add-fno-strict-aliasing-to-BPF_CFLAGS.patch b/ci/diffs/2001-selftests-bpf-add-fno-strict-aliasing-to-BPF_CFLAGS.patch new file mode 100644 index 0000000000000..9b24de70e2846 --- /dev/null +++ b/ci/diffs/2001-selftests-bpf-add-fno-strict-aliasing-to-BPF_CFLAGS.patch @@ -0,0 +1,75 @@ +From f44275e7155dc310d36516fc25be503da099781c Mon Sep 17 00:00:00 2001 +From: Ihor Solodrai +Date: Mon, 6 Jan 2025 20:17:31 +0000 +Subject: [PATCH] selftests/bpf: add -fno-strict-aliasing to BPF_CFLAGS + +Following the discussion at [1], set -fno-strict-aliasing flag for all +BPF object build rules. Remove now unnecessary -CFLAGS variables. + +[1] https://lore.kernel.org/bpf/20250106185447.951609-1-ihor.solodrai@pm.me/ + +CC: Jose E. Marchesi +Signed-off-by: Ihor Solodrai +Acked-by: Eduard Zingerman +Link: https://lore.kernel.org/r/20250106201728.1219791-1-ihor.solodrai@pm.me +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/Makefile | 28 +--------------------------- + 1 file changed, 1 insertion(+), 27 deletions(-) + +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index eb4d21651aa7..d5be2f94deef 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -54,21 +54,6 @@ PCAP_LIBS := $(shell $(PKG_CONFIG) --libs libpcap 2>/dev/null) + LDLIBS += $(PCAP_LIBS) + CFLAGS += $(PCAP_CFLAGS) + +-# The following tests perform type punning and they may break strict +-# aliasing rules, which are exploited by both GCC and clang by default +-# while optimizing. This can lead to broken programs. +-progs/bind4_prog.c-CFLAGS := -fno-strict-aliasing +-progs/bind6_prog.c-CFLAGS := -fno-strict-aliasing +-progs/dynptr_fail.c-CFLAGS := -fno-strict-aliasing +-progs/linked_list_fail.c-CFLAGS := -fno-strict-aliasing +-progs/map_kptr_fail.c-CFLAGS := -fno-strict-aliasing +-progs/syscall.c-CFLAGS := -fno-strict-aliasing +-progs/test_pkt_md_access.c-CFLAGS := -fno-strict-aliasing +-progs/test_sk_lookup.c-CFLAGS := -fno-strict-aliasing +-progs/timer_crash.c-CFLAGS := -fno-strict-aliasing +-progs/test_global_func9.c-CFLAGS := -fno-strict-aliasing +-progs/verifier_nocsr.c-CFLAGS := -fno-strict-aliasing +- + # Some utility functions use LLVM libraries + jit_disasm_helpers.c-CFLAGS = $(LLVM_CFLAGS) + +@@ -103,18 +88,6 @@ progs/btf_dump_test_case_packing.c-bpf_gcc-CFLAGS := -Wno-error + progs/btf_dump_test_case_padding.c-bpf_gcc-CFLAGS := -Wno-error + progs/btf_dump_test_case_syntax.c-bpf_gcc-CFLAGS := -Wno-error + +-# The following tests do type-punning, via the __imm_insn macro, from +-# `struct bpf_insn' to long and then uses the value. This triggers an +-# "is used uninitialized" warning in GCC due to strict-aliasing +-# rules. +-progs/verifier_ref_tracking.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_unpriv.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_cgroup_storage.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_ld_ind.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_map_ret_val.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_spill_fill.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_subprog_precision.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_uninit.c-bpf_gcc-CFLAGS := -fno-strict-aliasing + endif + + ifneq ($(CLANG_CPUV4),) +@@ -474,6 +447,7 @@ CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) + BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ + -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ + -I$(abspath $(OUTPUT)/../usr/include) \ ++ -fno-strict-aliasing \ + -Wno-compare-distinct-pointer-types + # TODO: enable me -Wsign-compare + +-- +2.47.1 + diff --git a/ci/diffs/2002-selftests-bpf-add-std-gnu11-to-BPF_CFLAGS-and-CFLAGS.patch b/ci/diffs/2002-selftests-bpf-add-std-gnu11-to-BPF_CFLAGS-and-CFLAGS.patch new file mode 100644 index 0000000000000..127b2641dc223 --- /dev/null +++ b/ci/diffs/2002-selftests-bpf-add-std-gnu11-to-BPF_CFLAGS-and-CFLAGS.patch @@ -0,0 +1,63 @@ +From bab18c7db44d3aa6c84450095451580922359c7a Mon Sep 17 00:00:00 2001 +From: Ihor Solodrai +Date: Tue, 7 Jan 2025 23:58:18 +0000 +Subject: [PATCH] selftests/bpf: add -std=gnu11 to BPF_CFLAGS and CFLAGS + +Latest versions of GCC BPF use C23 standard by default. This causes +compilation errors in vmlinux.h due to bool types declarations. + +Add -std=gnu11 to BPF_CFLAGS and CFLAGS. This aligns with the version +of the standard used when building the kernel currently [1]. + +For more details see the discussions at [2] and [3]. + +[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Makefile#n465 +[2] https://lore.kernel.org/bpf/EYcXjcKDCJY7Yb0GGtAAb7nLKPEvrgWdvWpuNzXm2qi6rYMZDixKv5KwfVVMBq17V55xyC-A1wIjrqG3aw-Imqudo9q9X7D7nLU2gWgbN0w=@pm.me/ +[3] https://lore.kernel.org/bpf/20250106202715.1232864-1-ihor.solodrai@pm.me/ + +CC: Jose E. Marchesi +Signed-off-by: Ihor Solodrai +Link: https://lore.kernel.org/r/20250107235813.2964472-1-ihor.solodrai@pm.me +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/Makefile | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index d5be2f94deef..ea9cee5de0f8 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -41,7 +41,7 @@ srctree := $(patsubst %/,%,$(dir $(srctree))) + srctree := $(patsubst %/,%,$(dir $(srctree))) + endif + +-CFLAGS += -g $(OPT_FLAGS) -rdynamic \ ++CFLAGS += -g $(OPT_FLAGS) -rdynamic -std=gnu11 \ + -Wall -Werror -fno-omit-frame-pointer \ + $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS) \ + -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ +@@ -447,6 +447,7 @@ CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) + BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ + -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ + -I$(abspath $(OUTPUT)/../usr/include) \ ++ -std=gnu11 \ + -fno-strict-aliasing \ + -Wno-compare-distinct-pointer-types + # TODO: enable me -Wsign-compare +@@ -787,9 +788,12 @@ $(OUTPUT)/xdp_features: xdp_features.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xdp + $(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ + + # Make sure we are able to include and link libbpf against c++. ++CXXFLAGS += $(CFLAGS) ++CXXFLAGS := $(subst -D_GNU_SOURCE=,,$(CXXFLAGS)) ++CXXFLAGS := $(subst -std=gnu11,-std=gnu++11,$(CXXFLAGS)) + $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) + $(call msg,CXX,,$@) +- $(Q)$(CXX) $(subst -D_GNU_SOURCE=,,$(CFLAGS)) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ ++ $(Q)$(CXX) $(CXXFLAGS) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ + + # Benchmark runner + $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(BPFOBJ) +-- +2.47.1 + diff --git a/ci/diffs/3001-selftests-bpf-Fix-stdout-race-condition-in-traffic-m.patch b/ci/diffs/3001-selftests-bpf-Fix-stdout-race-condition-in-traffic-m.patch new file mode 100644 index 0000000000000..1c078304d58b3 --- /dev/null +++ b/ci/diffs/3001-selftests-bpf-Fix-stdout-race-condition-in-traffic-m.patch @@ -0,0 +1,112 @@ +From b99f27e90268b1a814c13f8bd72ea1db448ea257 Mon Sep 17 00:00:00 2001 +From: Amery Hung +Date: Thu, 13 Feb 2025 15:32:17 -0800 +Subject: [PATCH] selftests/bpf: Fix stdout race condition in traffic monitor + +Fix a race condition between the main test_progs thread and the traffic +monitoring thread. The traffic monitor thread tries to print a line +using multiple printf and use flockfile() to prevent the line from being +torn apart. Meanwhile, the main thread doing io redirection can reassign +or close stdout when going through tests. A deadlock as shown below can +happen. + + main traffic_monitor_thread + ==== ====================== + show_transport() + -> flockfile(stdout) + +stdio_hijack_init() +-> stdout = open_memstream(log_buf, log_cnt); + ... + env.subtest_state->stdout_saved = stdout; + + ... + funlockfile(stdout) +stdio_restore_cleanup() +-> fclose(env.subtest_state->stdout_saved); + +After the traffic monitor thread lock stdout, A new memstream can be +assigned to stdout by the main thread. Therefore, the traffic monitor +thread later will not be able to unlock the original stdout. As the +main thread tries to access the old stdout, it will hang indefinitely +as it is still locked by the traffic monitor thread. + +The deadlock can be reproduced by running test_progs repeatedly with +traffic monitor enabled: + +for ((i=1;i<=100;i++)); do + ./test_progs -a flow_dissector_skb* -m '*' +done + +Fix this by only calling printf once and remove flockfile()/funlockfile(). + +Signed-off-by: Amery Hung +Signed-off-by: Martin KaFai Lau +Link: https://patch.msgid.link/20250213233217.553258-1-ameryhung@gmail.com +--- + tools/testing/selftests/bpf/network_helpers.c | 33 ++++++++----------- + 1 file changed, 13 insertions(+), 20 deletions(-) + +diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c +index a4252e000428..737a952dcf80 100644 +--- a/tools/testing/selftests/bpf/network_helpers.c ++++ b/tools/testing/selftests/bpf/network_helpers.c +@@ -788,12 +788,13 @@ static const char *pkt_type_str(u16 pkt_type) + return "Unknown"; + } + ++#define MAX_FLAGS_STRLEN 21 + /* Show the information of the transport layer in the packet */ + static void show_transport(const u_char *packet, u16 len, u32 ifindex, + const char *src_addr, const char *dst_addr, + u16 proto, bool ipv6, u8 pkt_type) + { +- char *ifname, _ifname[IF_NAMESIZE]; ++ char *ifname, _ifname[IF_NAMESIZE], flags[MAX_FLAGS_STRLEN] = ""; + const char *transport_str; + u16 src_port, dst_port; + struct udphdr *udp; +@@ -834,29 +835,21 @@ static void show_transport(const u_char *packet, u16 len, u32 ifindex, + + /* TCP or UDP*/ + +- flockfile(stdout); ++ if (proto == IPPROTO_TCP) ++ snprintf(flags, MAX_FLAGS_STRLEN, "%s%s%s%s", ++ tcp->fin ? ", FIN" : "", ++ tcp->syn ? ", SYN" : "", ++ tcp->rst ? ", RST" : "", ++ tcp->ack ? ", ACK" : ""); ++ + if (ipv6) +- printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d", ++ printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d%s\n", + ifname, pkt_type_str(pkt_type), src_addr, src_port, +- dst_addr, dst_port, transport_str, len); ++ dst_addr, dst_port, transport_str, len, flags); + else +- printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d", ++ printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d%s\n", + ifname, pkt_type_str(pkt_type), src_addr, src_port, +- dst_addr, dst_port, transport_str, len); +- +- if (proto == IPPROTO_TCP) { +- if (tcp->fin) +- printf(", FIN"); +- if (tcp->syn) +- printf(", SYN"); +- if (tcp->rst) +- printf(", RST"); +- if (tcp->ack) +- printf(", ACK"); +- } +- +- printf("\n"); +- funlockfile(stdout); ++ dst_addr, dst_port, transport_str, len, flags); + } + + static void show_ipv6_packet(const u_char *packet, u32 ifindex, u8 pkt_type) +-- +2.48.1 + diff --git a/ci/diffs/3002-selftests-bpf-Clean-up-call-sites-of-stdio_restore.patch b/ci/diffs/3002-selftests-bpf-Clean-up-call-sites-of-stdio_restore.patch new file mode 100644 index 0000000000000..3c0ad2dee6c0e --- /dev/null +++ b/ci/diffs/3002-selftests-bpf-Clean-up-call-sites-of-stdio_restore.patch @@ -0,0 +1,106 @@ +From aeb5bbb0253892f601702aee9fc00038824a1c59 Mon Sep 17 00:00:00 2001 +From: Amery Hung +Date: Wed, 5 Mar 2025 10:20:55 -0800 +Subject: [PATCH] selftests/bpf: Clean up call sites of stdio_restore() + +reset_affinity() and save_ns() are only called in run_one_test(). There is +no need to call stdio_restore() in reset_affinity() and save_ns() if +stdio_restore() is moved right after a test finishes in run_one_test(). + +Also remove an unnecessary check of env.stdout_saved in crash_handler() +by moving env.stdout_saved assignment to the beginning of main(). + +Signed-off-by: Amery Hung +Signed-off-by: Martin KaFai Lau +Acked-by: Eduard Zingerman +Link: https://patch.msgid.link/20250305182057.2802606-1-ameryhung@gmail.com +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/test_progs.c | 17 ++++++----------- + 1 file changed, 6 insertions(+), 11 deletions(-) + +diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c +index 0cb759632225..ab0f2fed3c58 100644 +--- a/tools/testing/selftests/bpf/test_progs.c ++++ b/tools/testing/selftests/bpf/test_progs.c +@@ -474,8 +474,6 @@ static void dump_test_log(const struct prog_test_def *test, + print_test_result(test, test_state); + } + +-static void stdio_restore(void); +- + /* A bunch of tests set custom affinity per-thread and/or per-process. Reset + * it after each test/sub-test. + */ +@@ -490,13 +488,11 @@ static void reset_affinity(void) + + err = sched_setaffinity(0, sizeof(cpuset), &cpuset); + if (err < 0) { +- stdio_restore(); + fprintf(stderr, "Failed to reset process affinity: %d!\n", err); + exit(EXIT_ERR_SETUP_INFRA); + } + err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); + if (err < 0) { +- stdio_restore(); + fprintf(stderr, "Failed to reset thread affinity: %d!\n", err); + exit(EXIT_ERR_SETUP_INFRA); + } +@@ -514,7 +510,6 @@ static void save_netns(void) + static void restore_netns(void) + { + if (setns(env.saved_netns_fd, CLONE_NEWNET) == -1) { +- stdio_restore(); + perror("setns(CLONE_NEWNS)"); + exit(EXIT_ERR_SETUP_INFRA); + } +@@ -1270,8 +1265,7 @@ void crash_handler(int signum) + + sz = backtrace(bt, ARRAY_SIZE(bt)); + +- if (env.stdout_saved) +- stdio_restore(); ++ stdio_restore(); + if (env.test) { + env.test_state->error_cnt++; + dump_test_log(env.test, env.test_state, true, false, NULL); +@@ -1400,6 +1394,8 @@ static void run_one_test(int test_num) + + state->tested = true; + ++ stdio_restore(); ++ + if (verbose() && env.worker_id == -1) + print_test_result(test, state); + +@@ -1408,7 +1404,6 @@ static void run_one_test(int test_num) + if (test->need_cgroup_cleanup) + cleanup_cgroup_environment(); + +- stdio_restore(); + free(stop_libbpf_log_capture()); + + dump_test_log(test, state, false, false, NULL); +@@ -1943,6 +1938,9 @@ int main(int argc, char **argv) + + sigaction(SIGSEGV, &sigact, NULL); + ++ env.stdout_saved = stdout; ++ env.stderr_saved = stderr; ++ + env.secs_till_notify = 10; + env.secs_till_kill = 120; + err = argp_parse(&argp, argc, argv, 0, NULL, &env); +@@ -1969,9 +1967,6 @@ int main(int argc, char **argv) + return -1; + } + +- env.stdout_saved = stdout; +- env.stderr_saved = stderr; +- + env.has_testmod = true; + if (!env.list_test_names) { + /* ensure previous instance of the module is unloaded */ +-- +2.48.1 + diff --git a/ci/diffs/3003-selftests-bpf-Allow-assigning-traffic-monitor-print-.patch b/ci/diffs/3003-selftests-bpf-Allow-assigning-traffic-monitor-print-.patch new file mode 100644 index 0000000000000..746d1d6210b1f --- /dev/null +++ b/ci/diffs/3003-selftests-bpf-Allow-assigning-traffic-monitor-print-.patch @@ -0,0 +1,179 @@ +From 34a25aabcdea5c6e42a283381f8354d70592744a Mon Sep 17 00:00:00 2001 +From: Amery Hung +Date: Wed, 5 Mar 2025 10:20:56 -0800 +Subject: [PATCH] selftests/bpf: Allow assigning traffic monitor print function + +Allow users to change traffic monitor's print function. If not provided, +traffic monitor will print to stdout by default. + +Signed-off-by: Amery Hung +Signed-off-by: Martin KaFai Lau +Link: https://patch.msgid.link/20250305182057.2802606-2-ameryhung@gmail.com +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/network_helpers.c | 69 ++++++++++++++----- + tools/testing/selftests/bpf/network_helpers.h | 9 +++ + 2 files changed, 59 insertions(+), 19 deletions(-) + +diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c +index 737a952dcf80..97fb7caf95f4 100644 +--- a/tools/testing/selftests/bpf/network_helpers.c ++++ b/tools/testing/selftests/bpf/network_helpers.c +@@ -750,6 +750,36 @@ struct tmonitor_ctx { + int pcap_fd; + }; + ++static int __base_pr(const char *format, va_list args) ++{ ++ return vfprintf(stdout, format, args); ++} ++ ++static tm_print_fn_t __tm_pr = __base_pr; ++ ++tm_print_fn_t traffic_monitor_set_print(tm_print_fn_t fn) ++{ ++ tm_print_fn_t old_print_fn; ++ ++ old_print_fn = __atomic_exchange_n(&__tm_pr, fn, __ATOMIC_RELAXED); ++ ++ return old_print_fn; ++} ++ ++void tm_print(const char *format, ...) ++{ ++ tm_print_fn_t print_fn; ++ va_list args; ++ ++ print_fn = __atomic_load_n(&__tm_pr, __ATOMIC_RELAXED); ++ if (!print_fn) ++ return; ++ ++ va_start(args, format); ++ print_fn(format, args); ++ va_end(args); ++} ++ + /* Is this packet captured with a Ethernet protocol type? */ + static bool is_ethernet(const u_char *packet) + { +@@ -767,7 +797,7 @@ static bool is_ethernet(const u_char *packet) + case 770: /* ARPHRD_FRAD */ + case 778: /* ARPHDR_IPGRE */ + case 803: /* ARPHRD_IEEE80211_RADIOTAP */ +- printf("Packet captured: arphdr_type=%d\n", arphdr_type); ++ tm_print("Packet captured: arphdr_type=%d\n", arphdr_type); + return false; + } + return true; +@@ -817,19 +847,19 @@ static void show_transport(const u_char *packet, u16 len, u32 ifindex, + dst_port = ntohs(tcp->dest); + transport_str = "TCP"; + } else if (proto == IPPROTO_ICMP) { +- printf("%-7s %-3s IPv4 %s > %s: ICMP, length %d, type %d, code %d\n", +- ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, +- packet[0], packet[1]); ++ tm_print("%-7s %-3s IPv4 %s > %s: ICMP, length %d, type %d, code %d\n", ++ ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, ++ packet[0], packet[1]); + return; + } else if (proto == IPPROTO_ICMPV6) { +- printf("%-7s %-3s IPv6 %s > %s: ICMPv6, length %d, type %d, code %d\n", +- ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, +- packet[0], packet[1]); ++ tm_print("%-7s %-3s IPv6 %s > %s: ICMPv6, length %d, type %d, code %d\n", ++ ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, ++ packet[0], packet[1]); + return; + } else { +- printf("%-7s %-3s %s %s > %s: protocol %d\n", +- ifname, pkt_type_str(pkt_type), ipv6 ? "IPv6" : "IPv4", +- src_addr, dst_addr, proto); ++ tm_print("%-7s %-3s %s %s > %s: protocol %d\n", ++ ifname, pkt_type_str(pkt_type), ipv6 ? "IPv6" : "IPv4", ++ src_addr, dst_addr, proto); + return; + } + +@@ -843,13 +873,13 @@ static void show_transport(const u_char *packet, u16 len, u32 ifindex, + tcp->ack ? ", ACK" : ""); + + if (ipv6) +- printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d%s\n", +- ifname, pkt_type_str(pkt_type), src_addr, src_port, +- dst_addr, dst_port, transport_str, len, flags); ++ tm_print("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d%s\n", ++ ifname, pkt_type_str(pkt_type), src_addr, src_port, ++ dst_addr, dst_port, transport_str, len, flags); + else +- printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d%s\n", +- ifname, pkt_type_str(pkt_type), src_addr, src_port, +- dst_addr, dst_port, transport_str, len, flags); ++ tm_print("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d%s\n", ++ ifname, pkt_type_str(pkt_type), src_addr, src_port, ++ dst_addr, dst_port, transport_str, len, flags); + } + + static void show_ipv6_packet(const u_char *packet, u32 ifindex, u8 pkt_type) +@@ -964,8 +994,8 @@ static void *traffic_monitor_thread(void *arg) + ifname = _ifname; + } + +- printf("%-7s %-3s Unknown network protocol type 0x%x\n", +- ifname, pkt_type_str(ptype), proto); ++ tm_print("%-7s %-3s Unknown network protocol type 0x%x\n", ++ ifname, pkt_type_str(ptype), proto); + } + } + +@@ -1165,8 +1195,9 @@ void traffic_monitor_stop(struct tmonitor_ctx *ctx) + write(ctx->wake_fd, &w, sizeof(w)); + pthread_join(ctx->thread, NULL); + +- printf("Packet file: %s\n", strrchr(ctx->pkt_fname, '/') + 1); ++ tm_print("Packet file: %s\n", strrchr(ctx->pkt_fname, '/') + 1); + + traffic_monitor_release(ctx); + } ++ + #endif /* TRAFFIC_MONITOR */ +diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h +index 9f6e05d886c5..a4e20c12c57e 100644 +--- a/tools/testing/selftests/bpf/network_helpers.h ++++ b/tools/testing/selftests/bpf/network_helpers.h +@@ -17,6 +17,7 @@ typedef __u16 __sum16; + #include + #include + #include ++#include + + #define MAGIC_VAL 0x1234 + #define NUM_ITER 100000 +@@ -249,10 +250,13 @@ static inline __sum16 build_udp_v6_csum(const struct ipv6hdr *ip6h, + + struct tmonitor_ctx; + ++typedef int (*tm_print_fn_t)(const char *format, va_list args); ++ + #ifdef TRAFFIC_MONITOR + struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, + const char *subtest_name); + void traffic_monitor_stop(struct tmonitor_ctx *ctx); ++tm_print_fn_t traffic_monitor_set_print(tm_print_fn_t fn); + #else + static inline struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, + const char *subtest_name) +@@ -263,6 +267,11 @@ static inline struct tmonitor_ctx *traffic_monitor_start(const char *netns, cons + static inline void traffic_monitor_stop(struct tmonitor_ctx *ctx) + { + } ++ ++static inline tm_print_fn_t traffic_monitor_set_print(tm_print_fn_t fn) ++{ ++ return NULL; ++} + #endif + + #endif +-- +2.48.1 + diff --git a/ci/diffs/3004-selftests-bpf-Fix-dangling-stdout-seen-by-traffic-mo.patch b/ci/diffs/3004-selftests-bpf-Fix-dangling-stdout-seen-by-traffic-mo.patch new file mode 100644 index 0000000000000..ad9f297ed14e9 --- /dev/null +++ b/ci/diffs/3004-selftests-bpf-Fix-dangling-stdout-seen-by-traffic-mo.patch @@ -0,0 +1,131 @@ +From 8bda5b787dea43a7102d5797756c60d0a905932a Mon Sep 17 00:00:00 2001 +From: Amery Hung +Date: Wed, 5 Mar 2025 10:20:57 -0800 +Subject: [PATCH] selftests/bpf: Fix dangling stdout seen by traffic monitor + thread + +Traffic monitor thread may see dangling stdout as the main thread closes +and reassigns stdout without protection. This happens when the main thread +finishes one subtest and moves to another one in the same netns_new() +scope. + +The issue can be reproduced by running test_progs repeatedly with traffic +monitor enabled: + +for ((i=1;i<=100;i++)); do + ./test_progs -a flow_dissector_skb* -m '*' +done + +For restoring stdout in crash_handler(), since it does not really care +about closing stdout, simlpy flush stdout and restore it to the original +one. + +Then, Fix the issue by consolidating stdio_restore_cleanup() and +stdio_restore(), and protecting the use/close/assignment of stdout with +a lock. The locking in the main thread is always performed regradless of +whether traffic monitor is running or not for simplicity. It won't have +any side-effect. + +Signed-off-by: Amery Hung +Signed-off-by: Martin KaFai Lau +Acked-by: Eduard Zingerman +Link: https://patch.msgid.link/20250305182057.2802606-3-ameryhung@gmail.com +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/test_progs.c | 39 +++++++++++++----------- + 1 file changed, 22 insertions(+), 17 deletions(-) + +diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c +index ab0f2fed3c58..d4ec9586b98c 100644 +--- a/tools/testing/selftests/bpf/test_progs.c ++++ b/tools/testing/selftests/bpf/test_progs.c +@@ -88,7 +88,9 @@ static void stdio_hijack(char **log_buf, size_t *log_cnt) + #endif + } + +-static void stdio_restore_cleanup(void) ++static pthread_mutex_t stdout_lock = PTHREAD_MUTEX_INITIALIZER; ++ ++static void stdio_restore(void) + { + #ifdef __GLIBC__ + if (verbose() && env.worker_id == -1) { +@@ -98,6 +100,8 @@ static void stdio_restore_cleanup(void) + + fflush(stdout); + ++ pthread_mutex_lock(&stdout_lock); ++ + if (env.subtest_state) { + fclose(env.subtest_state->stdout_saved); + env.subtest_state->stdout_saved = NULL; +@@ -106,26 +110,21 @@ static void stdio_restore_cleanup(void) + } else { + fclose(env.test_state->stdout_saved); + env.test_state->stdout_saved = NULL; ++ stdout = env.stdout_saved; ++ stderr = env.stderr_saved; + } ++ ++ pthread_mutex_unlock(&stdout_lock); + #endif + } + +-static void stdio_restore(void) ++static int traffic_monitor_print_fn(const char *format, va_list args) + { +-#ifdef __GLIBC__ +- if (verbose() && env.worker_id == -1) { +- /* nothing to do, output to stdout by default */ +- return; +- } +- +- if (stdout == env.stdout_saved) +- return; +- +- stdio_restore_cleanup(); ++ pthread_mutex_lock(&stdout_lock); ++ vfprintf(stdout, format, args); ++ pthread_mutex_unlock(&stdout_lock); + +- stdout = env.stdout_saved; +- stderr = env.stderr_saved; +-#endif ++ return 0; + } + + /* Adapted from perf/util/string.c */ +@@ -536,7 +535,8 @@ void test__end_subtest(void) + test_result(subtest_state->error_cnt, + subtest_state->skipped)); + +- stdio_restore_cleanup(); ++ stdio_restore(); ++ + env.subtest_state = NULL; + } + +@@ -1265,7 +1265,10 @@ void crash_handler(int signum) + + sz = backtrace(bt, ARRAY_SIZE(bt)); + +- stdio_restore(); ++ fflush(stdout); ++ stdout = env.stdout_saved; ++ stderr = env.stderr_saved; ++ + if (env.test) { + env.test_state->error_cnt++; + dump_test_log(env.test, env.test_state, true, false, NULL); +@@ -1957,6 +1960,8 @@ int main(int argc, char **argv) + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + libbpf_set_print(libbpf_print_fn); + ++ traffic_monitor_set_print(traffic_monitor_print_fn); ++ + srand(time(NULL)); + + env.jit_enabled = is_jit_enabled(); +-- +2.48.1 + diff --git a/ci/diffs/8888-netfs-Fix-setting-NETFS_RREQ_ALL_QUEUED-to-be-after-.patch b/ci/diffs/8888-netfs-Fix-setting-NETFS_RREQ_ALL_QUEUED-to-be-after-.patch new file mode 100644 index 0000000000000..31bc3fb4717ec --- /dev/null +++ b/ci/diffs/8888-netfs-Fix-setting-NETFS_RREQ_ALL_QUEUED-to-be-after-.patch @@ -0,0 +1,100 @@ +From a99b3ef16d4473f8f9b7547f89791d037112a7e6 Mon Sep 17 00:00:00 2001 +From: David Howells +Date: Wed, 12 Feb 2025 22:24:01 +0000 +Subject: [PATCH] netfs: Fix setting NETFS_RREQ_ALL_QUEUED to be after all + subreqs queued + +Due to the code that queues a subreq on the active subrequest list getting +moved to netfs_issue_read(), the NETFS_RREQ_ALL_QUEUED flag may now get set +before the list-add actually happens. This is not a problem if the +collection worker happens after the list-add, but it's a race - and, for +9P, where the read from the server is synchronous and done in the +submitting thread, this is a lot more likely. + +The result is that, if the timing is wrong, a ref gets leaked because the +collector thinks that all the subreqs have completed (because it can't see +the last one yet) and clears NETFS_RREQ_IN_PROGRESS - at which point, the +collection worker no longer goes into the collector. + +This can be provoked with AFS by injecting an msleep() right before the +final subreq is queued. + +Fix this by splitting the queuing part out of netfs_issue_read() into a new +function, netfs_queue_read(), and calling it separately. The setting of +NETFS_RREQ_ALL_QUEUED is then done by netfs_queue_read() whilst it is +holding the spinlock (that's probably unnecessary, but shouldn't hurt). + +It might be better to set a flag on the final subreq, but this could be a +problem if an error occurs and we can't queue it. + +Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") +Reported-by: Ihor Solodrai +Closes: https://lore.kernel.org/r/a7x33d4dnMdGTtRivptq6S1i8btK70SNBP2XyX_xwDAhLvgQoPox6FVBOkifq4eBinfFfbZlIkMZBe3QarlWTxoEtHZwJCZbNKtaqrR7PvI=@pm.me/ +Signed-off-by: David Howells +Tested-by: Ihor Solodrai +cc: Eric Van Hensbergen +cc: Latchesar Ionkov +cc: Dominique Martinet +cc: Christian Schoenebeck +cc: Marc Dionne +cc: Steve French +cc: Paulo Alcantara +cc: Jeff Layton +cc: v9fs@lists.linux.dev +cc: linux-cifs@vger.kernel.org +cc: netfs@lists.linux.dev +cc: linux-fsdevel@vger.kernel.org +--- + fs/netfs/buffered_read.c | 19 +++++++++++++------ + 1 file changed, 13 insertions(+), 6 deletions(-) + +diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c +index f761d44b3436..0d1b6d35ff3b 100644 +--- a/fs/netfs/buffered_read.c ++++ b/fs/netfs/buffered_read.c +@@ -155,8 +155,9 @@ static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq, + netfs_cache_read_terminated, subreq); + } + +-static void netfs_issue_read(struct netfs_io_request *rreq, +- struct netfs_io_subrequest *subreq) ++static void netfs_queue_read(struct netfs_io_request *rreq, ++ struct netfs_io_subrequest *subreq, ++ bool last_subreq) + { + struct netfs_io_stream *stream = &rreq->io_streams[0]; + +@@ -177,8 +178,17 @@ static void netfs_issue_read(struct netfs_io_request *rreq, + } + } + ++ if (last_subreq) { ++ smp_wmb(); /* Write lists before ALL_QUEUED. */ ++ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); ++ } ++ + spin_unlock(&rreq->lock); ++} + ++static void netfs_issue_read(struct netfs_io_request *rreq, ++ struct netfs_io_subrequest *subreq) ++{ + switch (subreq->source) { + case NETFS_DOWNLOAD_FROM_SERVER: + rreq->netfs_ops->issue_read(subreq); +@@ -293,11 +303,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) + } + size -= slice; + start += slice; +- if (size <= 0) { +- smp_wmb(); /* Write lists before ALL_QUEUED. */ +- set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); +- } + ++ netfs_queue_read(rreq, subreq, size <= 0); + netfs_issue_read(rreq, subreq); + cond_resched(); + } while (size > 0); +-- +2.48.1 + diff --git a/ci/diffs/9998-sched_ext-Fix-invalid-irq-restore-in-scx_ops_bypass.patch b/ci/diffs/9998-sched_ext-Fix-invalid-irq-restore-in-scx_ops_bypass.patch new file mode 100644 index 0000000000000..0f65cb47ce2e6 --- /dev/null +++ b/ci/diffs/9998-sched_ext-Fix-invalid-irq-restore-in-scx_ops_bypass.patch @@ -0,0 +1,56 @@ +From 10e1d78546b3dd4ea9d773c0b0257064a99211e9 Mon Sep 17 00:00:00 2001 +From: Tejun Heo +Date: Wed, 11 Dec 2024 11:01:51 -1000 +Subject: [PATCH] sched_ext: Fix invalid irq restore in scx_ops_bypass() + +While adding outer irqsave/restore locking, 0e7ffff1b811 ("scx: Fix raciness +in scx_ops_bypass()") forgot to convert an inner rq_unlock_irqrestore() to +rq_unlock() which could re-enable IRQ prematurely leading to the following +warning: + + raw_local_irq_restore() called with IRQs enabled + WARNING: CPU: 1 PID: 96 at kernel/locking/irqflag-debug.c:10 warn_bogus_irq_restore+0x30/0x40 + ... + Sched_ext: create_dsq (enabling) + pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) + pc : warn_bogus_irq_restore+0x30/0x40 + lr : warn_bogus_irq_restore+0x30/0x40 + ... + Call trace: + warn_bogus_irq_restore+0x30/0x40 (P) + warn_bogus_irq_restore+0x30/0x40 (L) + scx_ops_bypass+0x224/0x3b8 + scx_ops_enable.isra.0+0x2c8/0xaa8 + bpf_scx_reg+0x18/0x30 + ... + irq event stamp: 33739 + hardirqs last enabled at (33739): [] scx_ops_bypass+0x174/0x3b8 + hardirqs last disabled at (33738): [] _raw_spin_lock_irqsave+0xb4/0xd8 + +Drop the stray _irqrestore(). + +Signed-off-by: Tejun Heo +Reported-by: Ihor Solodrai +Link: http://lkml.kernel.org/r/qC39k3UsonrBYD_SmuxHnZIQLsuuccoCrkiqb_BT7DvH945A1_LZwE4g-5Pu9FcCtqZt4lY1HhIPi0homRuNWxkgo1rgP3bkxa0donw8kV4=@pm.me +Fixes: 0e7ffff1b811 ("scx: Fix raciness in scx_ops_bypass()") +Cc: stable@vger.kernel.org # v6.12 +--- + kernel/sched/ext.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c +index 7fff1d045477..98519e6d0dcd 100644 +--- a/kernel/sched/ext.c ++++ b/kernel/sched/ext.c +@@ -4763,7 +4763,7 @@ static void scx_ops_bypass(bool bypass) + * sees scx_rq_bypassing() before moving tasks to SCX. + */ + if (!scx_enabled()) { +- rq_unlock_irqrestore(rq, &rf); ++ rq_unlock(rq, &rf); + continue; + } + +-- +2.47.1 + diff --git a/ci/diffs/9999-scx-Fix-maximal-BPF-selftest-prog.patch b/ci/diffs/9999-scx-Fix-maximal-BPF-selftest-prog.patch new file mode 100644 index 0000000000000..9b5e6d50778e1 --- /dev/null +++ b/ci/diffs/9999-scx-Fix-maximal-BPF-selftest-prog.patch @@ -0,0 +1,56 @@ +From 70414cacbe536a197d56f58a42f9563e5a01b8ec Mon Sep 17 00:00:00 2001 +From: David Vernet +Date: Mon, 9 Dec 2024 09:29:24 -0600 +Subject: [PATCH] scx: Fix maximal BPF selftest prog + +maximal.bpf.c is still dispatching to and consuming from SCX_DSQ_GLOBAL. +Let's have it use its own DSQ to avoid any runtime errors. + +Signed-off-by: David Vernet +--- + tools/testing/selftests/sched_ext/maximal.bpf.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c +index 4c005fa71810..430f5e13bf55 100644 +--- a/tools/testing/selftests/sched_ext/maximal.bpf.c ++++ b/tools/testing/selftests/sched_ext/maximal.bpf.c +@@ -12,6 +12,8 @@ + + char _license[] SEC("license") = "GPL"; + ++#define DSQ_ID 0 ++ + s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, + u64 wake_flags) + { +@@ -20,7 +22,7 @@ s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, + + void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) + { +- scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); + } + + void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) +@@ -28,7 +30,7 @@ void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) + + void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) + { +- scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL); ++ scx_bpf_dsq_move_to_local(DSQ_ID); + } + + void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) +@@ -123,7 +125,7 @@ void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) + + s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) + { +- return 0; ++ return scx_bpf_create_dsq(DSQ_ID, -1); + } + + void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info) +-- +2.47.1 + diff --git a/ci/vmtest/configs/DENYLIST b/ci/vmtest/configs/DENYLIST new file mode 100644 index 0000000000000..752e667505816 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST @@ -0,0 +1,16 @@ +# TEMPORARY +btf_dump/btf_dump: syntax +kprobe_multi_bench_attach +core_reloc/enum64val +core_reloc/size___diff_sz +core_reloc/type_based___diff_sz +test_ima # All of CI is broken on it following 6.3-rc1 merge + +lwt_reroute # crashes kernel after netnext merge from 2ab1efad60ad "net/sched: cls_api: complement tcf_tfilter_dump_policy" +tc_links_ingress # started failing after net-next merge from 2ab1efad60ad "net/sched: cls_api: complement tcf_tfilter_dump_policy" +xdp_bonding/xdp_bonding_features # started failing after net merge from 359e54a93ab4 "l2tp: pass correct message length to ip6_append_data" +tc_redirect/tc_redirect_dtime # uapi breakage after net-next commit 885c36e59f46 ("net: Re-use and set mono_delivery_time bit for userspace tstamp packets") +migrate_reuseport/IPv4 TCP_NEW_SYN_RECV reqsk_timer_handler # flaky, under investigation +migrate_reuseport/IPv6 TCP_NEW_SYN_RECV reqsk_timer_handler # flaky, under investigation +connect_force_port # unreliably fails +sockmap_ktls/sockmap_ktls disconnect_after_delete* # https://lore.kernel.org/bpf/20250415163332.1836826-1-ihor.solodrai@linux.dev/ diff --git a/ci/vmtest/configs/DENYLIST.aarch64 b/ci/vmtest/configs/DENYLIST.aarch64 new file mode 100644 index 0000000000000..bdce99f3855ec --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.aarch64 @@ -0,0 +1,5 @@ +cgrp_local_storage # libbpf: prog 'update_cookie_tracing': failed to attach: ERROR: strerror_r(-524)=22 +core_reloc_btfgen # run_core_reloc_tests:FAIL:run_btfgen unexpected error: 32512 (errno 22) +usdt/multispec # usdt_300_bad_attach unexpected pointer: 0x558c63d8f0 +xdp_bonding # whole test suite is very unstable on aarch64 +res_spin_lock_success # flaky diff --git a/ci/vmtest/configs/DENYLIST.rc b/ci/vmtest/configs/DENYLIST.rc new file mode 100644 index 0000000000000..8aa33e6b71443 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.rc @@ -0,0 +1,3 @@ +send_signal/send_signal_nmi # PMU events configure correctly but don't trigger NMI's for some reason (AMD nested virt) +send_signal/send_signal_nmi_thread # Same as above +token/obj_priv_implicit_token_envvar # Unknown root cause, but reliably fails diff --git a/ci/vmtest/configs/DENYLIST.s390x b/ci/vmtest/configs/DENYLIST.s390x new file mode 100644 index 0000000000000..9b90b615aea55 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.s390x @@ -0,0 +1,11 @@ +deny_namespace # not yet in bpf denylist +tc_redirect/tc_redirect_dtime # very flaky +lru_bug # not yet in bpf-next denylist +# Disabled temporarily for a crash. +# https://lore.kernel.org/bpf/c9923c1d-971d-4022-8dc8-1364e929d34c@gmail.com/ +dummy_st_ops/dummy_init_ptr_arg +fexit_bpf2bpf +tailcalls +trace_ext +xdp_bpf2bpf +xdp_metadata diff --git a/ci/vmtest/configs/DENYLIST.test_progs-bpf_gcc b/ci/vmtest/configs/DENYLIST.test_progs-bpf_gcc new file mode 100644 index 0000000000000..a3c745d1f5b52 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.test_progs-bpf_gcc @@ -0,0 +1,904 @@ +arena_htab +async_stack_depth +bad_struct_ops/invalid_prog_reuse +bpf_cookie +bpf_iter/bpf_hash_map +bpf_iter/ksym +bpf_iter/tcp4 +bpf_iter/tcp6 +bpf_iter/udp4 +bpf_iter/udp6 +bpf_iter/unix +bpf_iter_setsockopt +bpf_iter_setsockopt_unix +bpf_mod_race +bpf_nf/tc-bpf-ct +bpf_nf/xdp-ct +bpf_tcp_ca/cubic +btf_dump/btf_dump: bitfields +btf_dump/btf_dump: packing +btf_dump/btf_dump: padding +btf_dump/btf_dump: syntax +btf_map_in_map +cb_refs +cgroup_get_current_cgroup_id +cgroup_iter/cgroup_iter__self_only_css_task +cgroup_tcp_skb +cgrp_kfunc +cls_redirect/cls_redirect_dynptr +connect_force_port +core_autosize +core_read_macros +core_reloc/type_id +core_reloc/type_id___missing_targets +core_reloc_btfgen/type_id +core_reloc_btfgen/type_id___missing_targets +cpumask/test_acquire_wrong_cpumask +cpumask/test_alloc_double_release +cpumask/test_alloc_free_cpumask +cpumask/test_alloc_no_release +cpumask/test_and_or_xor +cpumask/test_copy_any_anyand +cpumask/test_cpumask_null +cpumask/test_cpumask_weight +cpumask/test_first_firstzero_cpu +cpumask/test_firstand_nocpu +cpumask/test_global_mask_array_l2_rcu +cpumask/test_global_mask_array_one_rcu +cpumask/test_global_mask_array_rcu +cpumask/test_global_mask_nested_deep_array_rcu +cpumask/test_global_mask_nested_deep_rcu +cpumask/test_global_mask_nested_rcu +cpumask/test_global_mask_no_null_check +cpumask/test_global_mask_out_of_rcu +cpumask/test_global_mask_rcu +cpumask/test_global_mask_rcu_no_null_check +cpumask/test_insert_leave +cpumask/test_insert_remove_no_release +cpumask/test_insert_remove_release +cpumask/test_intersects_subset +cpumask/test_invalid_nested_array +cpumask/test_mutate_cpumask +cpumask/test_set_clear_cpu +cpumask/test_setall_clear_cpu +cpumask/test_test_and_set_clear +crypto_basic/crypto_acquire +crypto_sanity +deny_namespace +dummy_st_ops/test_unsupported_field_sleepable +dynptr/add_dynptr_to_map1 +dynptr/add_dynptr_to_map2 +dynptr/clone_invalid1 +dynptr/clone_invalid2 +dynptr/clone_invalidate1 +dynptr/clone_invalidate2 +dynptr/clone_invalidate3 +dynptr/clone_invalidate4 +dynptr/clone_invalidate5 +dynptr/clone_invalidate6 +dynptr/clone_skb_packet_data +dynptr/clone_xdp_packet_data +dynptr/data_slice_missing_null_check1 +dynptr/data_slice_missing_null_check2 +dynptr/data_slice_out_of_bounds_map_value +dynptr/data_slice_out_of_bounds_ringbuf +dynptr/data_slice_out_of_bounds_skb +dynptr/data_slice_use_after_release1 +dynptr/data_slice_use_after_release2 +dynptr/dynptr_adjust_invalid +dynptr/dynptr_from_mem_invalid_api +dynptr/dynptr_invalidate_slice_failure +dynptr/dynptr_invalidate_slice_or_null +dynptr/dynptr_invalidate_slice_reinit +dynptr/dynptr_is_null_invalid +dynptr/dynptr_is_rdonly_invalid +dynptr/dynptr_overwrite_ref +dynptr/dynptr_partial_slot_invalidate +dynptr/dynptr_pruning_overwrite +dynptr/dynptr_pruning_type_confusion +dynptr/dynptr_read_into_slot +dynptr/dynptr_size_invalid +dynptr/dynptr_slice_var_len1 +dynptr/dynptr_slice_var_len2 +dynptr/dynptr_var_off_overwrite +dynptr/global +dynptr/invalid_data_slices +dynptr/invalid_helper1 +dynptr/invalid_helper2 +dynptr/invalid_offset +dynptr/invalid_read1 +dynptr/invalid_read2 +dynptr/invalid_read3 +dynptr/invalid_read4 +dynptr/invalid_slice_rdwr_rdonly +dynptr/invalid_write1 +dynptr/invalid_write2 +dynptr/invalid_write3 +dynptr/invalid_write4 +dynptr/release_twice +dynptr/release_twice_callback +dynptr/ringbuf_invalid_api +dynptr/ringbuf_missing_release1 +dynptr/ringbuf_missing_release2 +dynptr/ringbuf_missing_release_callback +dynptr/ringbuf_release_uninit_dynptr +dynptr/skb_invalid_ctx +dynptr/skb_invalid_ctx_fentry +dynptr/skb_invalid_ctx_fexit +dynptr/skb_invalid_data_slice1 +dynptr/skb_invalid_data_slice2 +dynptr/skb_invalid_data_slice3 +dynptr/skb_invalid_data_slice4 +dynptr/skb_invalid_slice_write +dynptr/test_dynptr_reg_type +dynptr/test_dynptr_skb_no_buff +dynptr/test_dynptr_skb_small_buff +dynptr/test_dynptr_skb_tp_btf +dynptr/test_read_write +dynptr/uninit_write_into_slot +dynptr/use_after_invalid +dynptr/xdp_invalid_ctx +dynptr/xdp_invalid_data_slice1 +dynptr/xdp_invalid_data_slice2 +exceptions/check_assert_eq_int_max +exceptions/check_assert_eq_int_min +exceptions/check_assert_eq_llong_max +exceptions/check_assert_eq_llong_min +exceptions/check_assert_eq_zero +exceptions/check_assert_ge_neg +exceptions/check_assert_ge_pos +exceptions/check_assert_ge_zero +exceptions/check_assert_generic +exceptions/check_assert_gt_neg +exceptions/check_assert_gt_pos +exceptions/check_assert_gt_zero +exceptions/check_assert_le_neg +exceptions/check_assert_le_pos +exceptions/check_assert_le_zero +exceptions/check_assert_lt_neg +exceptions/check_assert_lt_pos +exceptions/check_assert_lt_zero +exceptions/check_assert_range_s64 +exceptions/check_assert_range_u64 +exceptions/check_assert_single_range_s64 +exceptions/check_assert_single_range_u64 +exceptions/check_assert_with_return +exceptions/exception_ext +exceptions/exception_ext_mod_cb_runtime +exceptions/non-throwing extension -> non-throwing subprog +exceptions/non-throwing extension -> throwing global subprog +exceptions/non-throwing fentry -> exception_cb +exceptions/non-throwing fexit -> exception_cb +exceptions/non-throwing fmod_ret -> non-throwing global subprog +exceptions/reject_async_callback_throw +exceptions/reject_exception_throw_cb +exceptions/reject_exception_throw_cb_diff +exceptions/reject_set_exception_cb_bad_ret2 +exceptions/reject_subprog_with_lock +exceptions/reject_subprog_with_rcu_read_lock +exceptions/reject_with_cb +exceptions/reject_with_cb_reference +exceptions/reject_with_lock +exceptions/reject_with_rbtree_add_throw +exceptions/reject_with_rcu_read_lock +exceptions/reject_with_reference +exceptions/reject_with_subprog_reference +exceptions/throwing extension (with custom cb) -> exception_cb +exceptions/throwing extension -> global func in exception_cb +exceptions/throwing extension -> non-throwing global subprog +exceptions/throwing extension -> throwing global subprog +exceptions/throwing fentry -> exception_cb +exceptions/throwing fexit -> exception_cb +failures_wq +fexit_bpf2bpf/fmod_ret_freplace +fexit_bpf2bpf/func_replace +fexit_bpf2bpf/func_replace_global_func +fexit_bpf2bpf/func_replace_multi +fexit_bpf2bpf/func_sockmap_update +fexit_bpf2bpf/target_yes_callees +global_func_dead_code +global_map_resize +inner_array_lookup +irq/irq_flag_overwrite +irq/irq_flag_overwrite_partial +irq/irq_global_subprog +irq/irq_ooo_refs_array +irq/irq_restore_4_subprog +irq/irq_restore_bad_arg +irq/irq_restore_invalid +irq/irq_restore_iter +irq/irq_restore_missing_1_subprog +irq/irq_restore_missing_2 +irq/irq_restore_missing_2_subprog +irq/irq_restore_missing_3 +irq/irq_restore_missing_3_minus_2 +irq/irq_restore_missing_3_minus_2_subprog +irq/irq_restore_missing_3_subprog +irq/irq_restore_ooo +irq/irq_restore_ooo_3 +irq/irq_restore_ooo_3_subprog +irq/irq_save_bad_arg +irq/irq_save_invalid +irq/irq_save_iter +irq/irq_sleepable_helper +irq/irq_sleepable_kfunc +iters/compromise_iter_w_direct_write_and_skip_destroy_fail +iters/compromise_iter_w_direct_write_fail +iters/compromise_iter_w_helper_write_fail +iters/create_and_forget_to_destroy_fail +iters/css_task +iters/delayed_precision_mark +iters/delayed_read_mark +iters/destroy_without_creating_fail +iters/double_create_fail +iters/double_destroy_fail +iters/iter_css_lock_and_unlock +iters/iter_css_task_for_each +iters/iter_css_without_lock +iters/iter_destroy_bad_arg +iters/iter_err_too_permissive1 +iters/iter_err_too_permissive2 +iters/iter_err_too_permissive3 +iters/iter_err_unsafe_asm_loop +iters/iter_err_unsafe_c_loop +iters/iter_nested_iters +iters/iter_new_bad_arg +iters/iter_next_bad_arg +iters/iter_next_ptr_mem_not_trusted +iters/iter_next_rcu_not_trusted +iters/iter_next_rcu_or_null +iters/iter_next_trusted_or_null +iters/iter_obfuscate_counter +iters/iter_subprog_iters +iters/iter_tasks_lock_and_unlock +iters/iter_tasks_without_lock +iters/leak_iter_from_subprog_fail +iters/loop_state_deps1 +iters/loop_state_deps2 +iters/missing_null_check_fail +iters/next_after_destroy_fail +iters/next_without_new_fail +iters/read_from_iter_slot_fail +iters/stacksafe_should_not_conflate_stack_spill_and_iter +iters/testmod_seq_getter_after_bad +iters/testmod_seq_getter_before_bad +iters/wrong_sized_read_fail +jeq_infer_not_null +jit_probe_mem +kfree_skb +kfunc_call/kfunc_call_ctx +kfunc_call/kfunc_call_test1 +kfunc_call/kfunc_call_test2 +kfunc_call/kfunc_call_test4 +kfunc_call/kfunc_call_test_get_mem +kfunc_call/kfunc_call_test_ref_btf_id +kfunc_call/kfunc_call_test_static_unused_arg +kfunc_call/kfunc_syscall_test +kfunc_call/kfunc_syscall_test_null +kfunc_dynptr_param/not_ptr_to_stack +kfunc_dynptr_param/not_valid_dynptr +kfunc_param_nullable/kfunc_dynptr_nullable_test3 +kprobe_multi_test/kprobe_session_return_2 +kptr_xchg_inline +l4lb_all/l4lb_noinline +l4lb_all/l4lb_noinline_dynptr +linked_list +local_kptr_stash/drop_rb_node_off +local_kptr_stash/local_kptr_stash_local_with_root +local_kptr_stash/local_kptr_stash_plain +local_kptr_stash/local_kptr_stash_simple +local_kptr_stash/local_kptr_stash_unstash +local_kptr_stash/refcount_acquire_without_unstash +local_kptr_stash/stash_rb_nodes +log_buf/obj_load_log_buf +log_fixup/bad_core_relo_subprog +log_fixup/bad_core_relo_trunc_full +lru_bug +map_btf +map_in_map/acc_map_in_array +map_in_map/acc_map_in_htab +map_in_map/sleepable_acc_map_in_array +map_in_map/sleepable_acc_map_in_htab +map_kptr/correct_btf_id_check_size +map_kptr/inherit_untrusted_on_walk +map_kptr/kptr_xchg_possibly_null +map_kptr/kptr_xchg_ref_state +map_kptr/mark_ref_as_untrusted_or_null +map_kptr/marked_as_untrusted_or_null +map_kptr/non_const_var_off +map_kptr/non_const_var_off_kptr_xchg +map_kptr/reject_bad_type_xchg +map_kptr/reject_kptr_xchg_on_unref +map_kptr/reject_member_of_ref_xchg +map_kptr/reject_untrusted_xchg +map_kptr/success-map +map_ptr +nested_trust/test_invalid_nested_user_cpus +nested_trust/test_invalid_skb_field +percpu_alloc/array +percpu_alloc/array_sleepable +percpu_alloc/cgrp_local_storage +percpu_alloc/test_array_map_1 +percpu_alloc/test_array_map_2 +percpu_alloc/test_array_map_3 +percpu_alloc/test_array_map_4 +percpu_alloc/test_array_map_5 +percpu_alloc/test_array_map_6 +percpu_alloc/test_array_map_7 +percpu_alloc/test_array_map_8 +perf_branches/perf_branches_no_hw +pkt_access +preempt_lock/preempt_global_subprog_test +preempt_lock/preempt_lock_missing_1 +preempt_lock/preempt_lock_missing_1_subprog +preempt_lock/preempt_lock_missing_2 +preempt_lock/preempt_lock_missing_2_minus_1_subprog +preempt_lock/preempt_lock_missing_2_subprog +preempt_lock/preempt_lock_missing_3 +preempt_lock/preempt_lock_missing_3_minus_2 +preempt_lock/preempt_sleepable_helper +preempt_lock/preempt_sleepable_kfunc +preempted_bpf_ma_op +prog_run_opts +prog_tests_framework +raw_tp_null +rbtree_fail +rbtree_success +recursion +refcounted_kptr +refcounted_kptr_fail +refcounted_kptr_wrong_owner +reference_tracking/sk_lookup_success +ringbuf_multi +setget_sockopt +sk_lookup +skc_to_unix_sock +sock_addr/recvmsg4: attach prog with wrong attach type +sock_addr/recvmsg4: recvfrom (dgram) +sock_addr/recvmsg6: attach prog with wrong attach type +sock_addr/recvmsg6: recvfrom (dgram) +sock_addr/sendmsg4: attach prog with wrong attach type +sock_addr/sendmsg4: kernel_sendmsg (dgram) +sock_addr/sendmsg4: kernel_sendmsg deny (dgram) +sock_addr/sendmsg4: sendmsg (dgram) +sock_addr/sendmsg4: sendmsg deny (dgram) +sock_addr/sendmsg4: sock_sendmsg (dgram) +sock_addr/sendmsg4: sock_sendmsg deny (dgram) +sock_addr/sendmsg6: attach prog with wrong attach type +sock_addr/sendmsg6: kernel_sendmsg (dgram) +sock_addr/sendmsg6: kernel_sendmsg [::] (BSD'ism) (dgram) +sock_addr/sendmsg6: kernel_sendmsg deny (dgram) +sock_addr/sendmsg6: sendmsg (dgram) +sock_addr/sendmsg6: sendmsg IPv4-mapped IPv6 (dgram) +sock_addr/sendmsg6: sendmsg [::] (BSD'ism) (dgram) +sock_addr/sendmsg6: sendmsg deny (dgram) +sock_addr/sendmsg6: sendmsg dst IP = [::] (BSD'ism) (dgram) +sock_addr/sendmsg6: sock_sendmsg (dgram) +sock_addr/sendmsg6: sock_sendmsg [::] (BSD'ism) (dgram) +sock_addr/sendmsg6: sock_sendmsg deny (dgram) +sock_destroy/trace_tcp_destroy_sock +sock_fields +sockmap_listen/sockhash IPv4 TCP test_reuseport_mixed_groups +sockmap_listen/sockhash IPv4 TCP test_reuseport_select_connected +sockmap_listen/sockhash IPv4 UDP test_reuseport_mixed_groups +sockmap_listen/sockhash IPv4 UDP test_reuseport_select_connected +sockmap_listen/sockhash IPv6 TCP test_reuseport_mixed_groups +sockmap_listen/sockhash IPv6 TCP test_reuseport_select_connected +sockmap_listen/sockhash IPv6 UDP test_reuseport_mixed_groups +sockmap_listen/sockhash IPv6 UDP test_reuseport_select_connected +sockmap_listen/sockmap IPv4 TCP test_reuseport_mixed_groups +sockmap_listen/sockmap IPv4 TCP test_reuseport_select_connected +sockmap_listen/sockmap IPv4 UDP test_reuseport_mixed_groups +sockmap_listen/sockmap IPv4 UDP test_reuseport_select_connected +sockmap_listen/sockmap IPv6 TCP test_reuseport_mixed_groups +sockmap_listen/sockmap IPv6 TCP test_reuseport_select_connected +sockmap_listen/sockmap IPv6 UDP test_reuseport_mixed_groups +sockmap_listen/sockmap IPv6 UDP test_reuseport_select_connected +spin_lock +struct_ops_module/unsupported_ops +syscall +tailcalls/classifier_0 +tailcalls/classifier_1 +tailcalls/reject_tail_call_preempt_lock +tailcalls/reject_tail_call_rcu_lock +tailcalls/reject_tail_call_ref +tailcalls/reject_tail_call_spin_lock +tailcalls/tailcall_6 +tailcalls/tailcall_bpf2bpf_2 +tailcalls/tailcall_bpf2bpf_3 +tailcalls/tailcall_bpf2bpf_fentry +tailcalls/tailcall_bpf2bpf_fentry_entry +tailcalls/tailcall_bpf2bpf_fentry_fexit +tailcalls/tailcall_bpf2bpf_fexit +tailcalls/tailcall_bpf2bpf_hierarchy_2 +tailcalls/tailcall_bpf2bpf_hierarchy_3 +task_kfunc +task_local_storage/uptr_across_pages +task_local_storage/uptr_basic +task_local_storage/uptr_kptr_xchg +task_local_storage/uptr_map_failure_e2big +task_local_storage/uptr_map_failure_kstruct +task_local_storage/uptr_map_failure_size0 +task_local_storage/uptr_no_null_check +task_local_storage/uptr_obj_new +task_local_storage/uptr_update_failure +tc_bpf/tc_bpf_non_root +tc_redirect/tc_redirect_dtime +tcp_custom_syncookie +tcp_hdr_options +test_bpf_ma +test_global_funcs/arg_tag_ctx_kprobe +test_global_funcs/arg_tag_ctx_perf +test_global_funcs/arg_tag_ctx_raw_tp +test_global_funcs/global_func1 +test_global_funcs/global_func10 +test_global_funcs/global_func11 +test_global_funcs/global_func12 +test_global_funcs/global_func13 +test_global_funcs/global_func14 +test_global_funcs/global_func15 +test_global_funcs/global_func15_tricky_pruning +test_global_funcs/global_func17 +test_global_funcs/global_func3 +test_global_funcs/global_func5 +test_global_funcs/global_func6 +test_global_funcs/global_func7 +test_lsm/lsm_basic +test_profiler +test_strncmp/strncmp_bad_not_null_term_target +timer +timer_mim +token +tp_btf_nullable/handle_tp_btf_nullable_bare1 +tunnel +uprobe_multi_test/uprobe_sesison_return_2 +user_ringbuf/user_ringbuf_callback_bad_access1 +user_ringbuf/user_ringbuf_callback_bad_access2 +user_ringbuf/user_ringbuf_callback_const_ptr_to_dynptr_reg_off +user_ringbuf/user_ringbuf_callback_discard_dynptr +user_ringbuf/user_ringbuf_callback_invalid_return +user_ringbuf/user_ringbuf_callback_null_context_read +user_ringbuf/user_ringbuf_callback_null_context_write +user_ringbuf/user_ringbuf_callback_reinit_dynptr_mem +user_ringbuf/user_ringbuf_callback_reinit_dynptr_ringbuf +user_ringbuf/user_ringbuf_callback_submit_dynptr +user_ringbuf/user_ringbuf_callback_write_forbidden +verif_scale_pyperf100 +verif_scale_pyperf180 +verif_scale_pyperf600 +verif_scale_pyperf600_nounroll +verif_scale_seg6_loop +verif_scale_strobemeta +verif_scale_strobemeta_nounroll1 +verif_scale_strobemeta_nounroll2 +verif_scale_strobemeta_subprogs +verif_scale_sysctl_loop1 +verif_scale_sysctl_loop2 +verif_scale_xdp_loop +verifier_and/invalid_and_of_negative_number +verifier_and/invalid_range_check +verifier_arena/iter_maps2 +verifier_arena/iter_maps3 +verifier_array_access/a_read_only_array_1_2 +verifier_array_access/a_read_only_array_2_2 +verifier_array_access/a_write_only_array_1_2 +verifier_array_access/a_write_only_array_2_2 +verifier_array_access/an_array_with_a_constant_2 +verifier_array_access/an_array_with_a_register_2 +verifier_array_access/an_array_with_a_variable_2 +verifier_array_access/array_with_no_floor_check +verifier_array_access/with_a_invalid_max_check_1 +verifier_array_access/with_a_invalid_max_check_2 +verifier_basic_stack/invalid_fp_arithmetic +verifier_basic_stack/misaligned_read_from_stack +verifier_basic_stack/stack_out_of_bounds +verifier_bitfield_write +verifier_bits_iter/destroy_uninit +verifier_bits_iter/next_uninit +verifier_bits_iter/no_destroy +verifier_bounds/bounds_map_value_variant_1 +verifier_bounds/bounds_map_value_variant_2 +verifier_bounds/of_boundary_crossing_range_1 +verifier_bounds/of_boundary_crossing_range_2 +verifier_bounds/on_sign_extended_mov_test1 +verifier_bounds/on_sign_extended_mov_test2 +verifier_bounds/reg32_any_reg32_xor_3 +verifier_bounds/reg_any_reg_xor_3 +verifier_bounds/shift_of_maybe_negative_number +verifier_bounds/shift_with_64_bit_input +verifier_bounds/shift_with_oversized_count_operand +verifier_bounds/size_signed_32bit_overflow_test1 +verifier_bounds/size_signed_32bit_overflow_test2 +verifier_bounds/size_signed_32bit_overflow_test3 +verifier_bounds/size_signed_32bit_overflow_test4 +verifier_bounds/var_off_insn_off_test1 +verifier_bounds/var_off_insn_off_test2 +verifier_bounds_deduction/deducing_bounds_from_const_1 +verifier_bounds_deduction/deducing_bounds_from_const_10 +verifier_bounds_deduction/deducing_bounds_from_const_3 +verifier_bounds_deduction/deducing_bounds_from_const_5 +verifier_bounds_deduction/deducing_bounds_from_const_6 +verifier_bounds_deduction/deducing_bounds_from_const_7 +verifier_bounds_deduction/deducing_bounds_from_const_8 +verifier_bounds_deduction/deducing_bounds_from_const_9 +verifier_bounds_mix_sign_unsign/checks_mixing_signed_and_unsigned +verifier_bounds_mix_sign_unsign/signed_and_unsigned_positive_bounds +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_10 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_11 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_12 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_13 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_14 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_15 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_2 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_3 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_5 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_6 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_8 +verifier_btf_ctx_access/ctx_access_u32_pointer_reject_16 +verifier_btf_ctx_access/ctx_access_u32_pointer_reject_32 +verifier_btf_ctx_access/ctx_access_u32_pointer_reject_8 +verifier_cfg/conditional_loop +verifier_cfg/loop2_back_edge +verifier_cfg/loop_back_edge +verifier_cfg/out_of_range_jump +verifier_cfg/out_of_range_jump2 +verifier_cfg/uncond_loop_after_cond_jmp +verifier_cfg/uncond_loop_in_subprog_after_cond_jmp +verifier_cfg/unreachable +verifier_cfg/unreachable2 +verifier_cgroup_inv_retcode/with_invalid_return_code_test1 +verifier_cgroup_inv_retcode/with_invalid_return_code_test3 +verifier_cgroup_inv_retcode/with_invalid_return_code_test5 +verifier_cgroup_inv_retcode/with_invalid_return_code_test6 +verifier_cgroup_inv_retcode/with_invalid_return_code_test7 +verifier_cgroup_skb/data_meta_for_cgroup_skb +verifier_cgroup_skb/flow_keys_for_cgroup_skb +verifier_cgroup_skb/napi_id_for_cgroup_skb +verifier_cgroup_skb/tc_classid_for_cgroup_skb +verifier_cgroup_storage/cpu_cgroup_storage_access_1 +verifier_cgroup_storage/cpu_cgroup_storage_access_2 +verifier_cgroup_storage/cpu_cgroup_storage_access_3 +verifier_cgroup_storage/cpu_cgroup_storage_access_4 +verifier_cgroup_storage/cpu_cgroup_storage_access_5 +verifier_cgroup_storage/cpu_cgroup_storage_access_6 +verifier_cgroup_storage/invalid_cgroup_storage_access_1 +verifier_cgroup_storage/invalid_cgroup_storage_access_2 +verifier_cgroup_storage/invalid_cgroup_storage_access_3 +verifier_cgroup_storage/invalid_cgroup_storage_access_4 +verifier_cgroup_storage/invalid_cgroup_storage_access_5 +verifier_cgroup_storage/invalid_cgroup_storage_access_6 +verifier_const/bprm +verifier_const/tcx1 +verifier_const/tcx4 +verifier_const/tcx7 +verifier_const_or/not_bypass_stack_boundary_checks_1 +verifier_const_or/not_bypass_stack_boundary_checks_2 +verifier_ctx/context_stores_via_bpf_atomic +verifier_ctx/ctx_pointer_to_helper_1 +verifier_ctx/ctx_pointer_to_helper_2 +verifier_ctx/ctx_pointer_to_helper_3 +verifier_ctx/make_ptr_to_ctx_unusable +verifier_ctx/null_check_4_ctx_const +verifier_ctx/null_check_8_null_bind +verifier_ctx/or_null_check_3_1 +verifier_ctx_sk_msg/of_size_in_sk_msg +verifier_ctx_sk_msg/past_end_of_sk_msg +verifier_ctx_sk_msg/read_offset_in_sk_msg +verifier_d_path/d_path_reject +verifier_direct_packet_access/access_test15_spill_with_xadd +verifier_direct_packet_access/direct_packet_access_test3 +verifier_direct_packet_access/id_in_regsafe_bad_access +verifier_direct_packet_access/packet_access_test10_write_invalid +verifier_direct_packet_access/pkt_end_reg_bad_access +verifier_direct_packet_access/pkt_end_reg_both_accesses +verifier_direct_packet_access/test16_arith_on_data_end +verifier_direct_packet_access/test23_x_pkt_ptr_4 +verifier_direct_packet_access/test26_marking_on_bad_access +verifier_direct_packet_access/test28_marking_on_bad_access +verifier_direct_stack_access_wraparound +verifier_global_ptr_args +verifier_global_subprogs +verifier_helper_access_var_len/bitwise_and_jmp_wrong_max +verifier_helper_access_var_len/jmp_signed_no_min_check +verifier_helper_access_var_len/map_adjusted_jmp_wrong_max +verifier_helper_access_var_len/memory_map_jmp_wrong_max +verifier_helper_access_var_len/memory_stack_jmp_bounds_offset +verifier_helper_access_var_len/memory_stack_jmp_wrong_max +verifier_helper_access_var_len/ptr_to_mem_or_null_2 +verifier_helper_access_var_len/ptr_to_mem_or_null_8 +verifier_helper_access_var_len/ptr_to_mem_or_null_9 +verifier_helper_access_var_len/stack_jmp_no_max_check +verifier_helper_packet_access/cls_helper_fail_range_1 +verifier_helper_packet_access/cls_helper_fail_range_2 +verifier_helper_packet_access/cls_helper_fail_range_3 +verifier_helper_packet_access/packet_ptr_with_bad_range_1 +verifier_helper_packet_access/packet_ptr_with_bad_range_2 +verifier_helper_packet_access/packet_test2_unchecked_packet_ptr +verifier_helper_packet_access/ptr_with_too_short_range_1 +verifier_helper_packet_access/ptr_with_too_short_range_2 +verifier_helper_packet_access/test11_cls_unsuitable_helper_1 +verifier_helper_packet_access/test12_cls_unsuitable_helper_2 +verifier_helper_packet_access/test15_cls_helper_fail_sub +verifier_helper_packet_access/test20_pkt_end_as_input +verifier_helper_packet_access/test7_cls_unchecked_packet_ptr +verifier_helper_packet_access/to_packet_test21_wrong_reg +verifier_helper_restricted +verifier_helper_value_access/access_to_map_empty_range +verifier_helper_value_access/access_to_map_negative_range +verifier_helper_value_access/access_to_map_possibly_empty_range +verifier_helper_value_access/access_to_map_wrong_size +verifier_helper_value_access/bounds_check_using_bad_access_1 +verifier_helper_value_access/bounds_check_using_bad_access_2 +verifier_helper_value_access/check_using_s_bad_access_1 +verifier_helper_value_access/check_using_s_bad_access_2 +verifier_helper_value_access/const_imm_negative_range_adjustment_1 +verifier_helper_value_access/const_imm_negative_range_adjustment_2 +verifier_helper_value_access/const_reg_negative_range_adjustment_1 +verifier_helper_value_access/const_reg_negative_range_adjustment_2 +verifier_helper_value_access/imm_out_of_bound_1 +verifier_helper_value_access/imm_out_of_bound_2 +verifier_helper_value_access/imm_out_of_bound_range +verifier_helper_value_access/map_out_of_bound_range +verifier_helper_value_access/map_via_variable_empty_range +verifier_helper_value_access/reg_out_of_bound_1 +verifier_helper_value_access/reg_out_of_bound_2 +verifier_helper_value_access/reg_out_of_bound_range +verifier_helper_value_access/via_const_imm_empty_range +verifier_helper_value_access/via_const_reg_empty_range +verifier_helper_value_access/via_variable_no_max_check_1 +verifier_helper_value_access/via_variable_no_max_check_2 +verifier_helper_value_access/via_variable_wrong_max_check_1 +verifier_helper_value_access/via_variable_wrong_max_check_2 +verifier_int_ptr/arg_ptr_to_long_misaligned +verifier_int_ptr/to_long_size_sizeof_long +verifier_iterating_callbacks/bpf_loop_iter_limit_overflow +verifier_iterating_callbacks/check_add_const_3regs +verifier_iterating_callbacks/check_add_const_3regs_2if +verifier_iterating_callbacks/check_add_const_regsafe_off +verifier_iterating_callbacks/iter_limit_bug +verifier_iterating_callbacks/jgt_imm64_and_may_goto +verifier_iterating_callbacks/loop_detection +verifier_iterating_callbacks/may_goto_self +verifier_iterating_callbacks/unsafe_find_vma +verifier_iterating_callbacks/unsafe_for_each_map_elem +verifier_iterating_callbacks/unsafe_on_2nd_iter +verifier_iterating_callbacks/unsafe_on_zero_iter +verifier_iterating_callbacks/unsafe_ringbuf_drain +verifier_jeq_infer_not_null/unchanged_for_jeq_false_branch +verifier_jeq_infer_not_null/unchanged_for_jne_true_branch +verifier_kfunc_prog_types/cgrp_kfunc_raw_tp +verifier_kfunc_prog_types/cpumask_kfunc_raw_tp +verifier_kfunc_prog_types/task_kfunc_raw_tp +verifier_ld_ind/ind_check_calling_conv_r1 +verifier_ld_ind/ind_check_calling_conv_r2 +verifier_ld_ind/ind_check_calling_conv_r3 +verifier_ld_ind/ind_check_calling_conv_r4 +verifier_ld_ind/ind_check_calling_conv_r5 +verifier_leak_ptr/leak_pointer_into_ctx_1 +verifier_leak_ptr/leak_pointer_into_ctx_2 +verifier_linked_scalars +verifier_loops1/bounded_recursion +verifier_loops1/infinite_loop_in_two_jumps +verifier_loops1/infinite_loop_three_jump_trick +verifier_loops1/loop_after_a_conditional_jump +verifier_lsm/bool_retval_test3 +verifier_lsm/bool_retval_test4 +verifier_lsm/disabled_hook_test1 +verifier_lsm/disabled_hook_test2 +verifier_lsm/disabled_hook_test3 +verifier_lsm/errno_zero_retval_test4 +verifier_lsm/errno_zero_retval_test5 +verifier_lsm/errno_zero_retval_test6 +verifier_lwt/not_permitted_for_lwt_prog +verifier_lwt/packet_write_for_lwt_in +verifier_lwt/packet_write_for_lwt_out +verifier_lwt/tc_classid_for_lwt_in +verifier_lwt/tc_classid_for_lwt_out +verifier_lwt/tc_classid_for_lwt_xmit +verifier_map_in_map/invalid_inner_map_pointer +verifier_map_in_map/on_the_inner_map_pointer +verifier_map_ptr/bpf_map_ptr_write_rejected +verifier_map_ptr/read_non_existent_field_rejected +verifier_map_ptr/read_with_negative_offset_rejected +verifier_map_ptr_mixing +verifier_map_ret_val +verifier_meta_access/meta_access_test10 +verifier_meta_access/meta_access_test2 +verifier_meta_access/meta_access_test3 +verifier_meta_access/meta_access_test4 +verifier_meta_access/meta_access_test5 +verifier_meta_access/meta_access_test6 +verifier_meta_access/meta_access_test9 +verifier_netfilter_ctx/with_invalid_ctx_access_test1 +verifier_netfilter_ctx/with_invalid_ctx_access_test2 +verifier_netfilter_ctx/with_invalid_ctx_access_test3 +verifier_netfilter_ctx/with_invalid_ctx_access_test4 +verifier_netfilter_ctx/with_invalid_ctx_access_test5 +verifier_netfilter_retcode/with_invalid_return_code_test1 +verifier_netfilter_retcode/with_invalid_return_code_test4 +verifier_or_jmp32_k +verifier_prevent_map_lookup +verifier_raw_stack/bytes_spilled_regs_corruption_2 +verifier_raw_stack/load_bytes_invalid_access_1 +verifier_raw_stack/load_bytes_invalid_access_2 +verifier_raw_stack/load_bytes_invalid_access_3 +verifier_raw_stack/load_bytes_invalid_access_4 +verifier_raw_stack/load_bytes_invalid_access_5 +verifier_raw_stack/load_bytes_invalid_access_6 +verifier_raw_stack/load_bytes_negative_len_2 +verifier_raw_stack/load_bytes_spilled_regs_corruption +verifier_raw_stack/skb_load_bytes_negative_len +verifier_raw_stack/skb_load_bytes_zero_len +verifier_raw_tp_writable +verifier_ref_tracking +verifier_reg_equal/subreg_equality_2 +verifier_regalloc/regalloc_and_spill_negative +verifier_regalloc/regalloc_negative +verifier_regalloc/regalloc_src_reg_negative +verifier_ringbuf/ringbuf_invalid_reservation_offset_1 +verifier_ringbuf/ringbuf_invalid_reservation_offset_2 +verifier_runtime_jit +verifier_scalar_ids/check_ids_in_regsafe +verifier_scalar_ids/check_ids_in_regsafe_2 +verifier_scalar_ids/linked_regs_broken_link_2 +verifier_search_pruning/for_u32_spills_u64_fill +verifier_search_pruning/liveness_pruning_and_write_screening +verifier_search_pruning/short_loop1 +verifier_search_pruning/should_be_verified_nop_operation +verifier_search_pruning/tracking_for_u32_spill_fill +verifier_search_pruning/varlen_map_value_access_pruning +verifier_sock/bpf_sk_fullsock_skb_sk +verifier_sock/bpf_sk_release_skb_sk +verifier_sock/bpf_tcp_sock_skb_sk +verifier_sock/dst_port_byte_load_invalid +verifier_sock/dst_port_half_load_invalid_1 +verifier_sock/dst_port_half_load_invalid_2 +verifier_sock/invalidate_pkt_pointers_by_tail_call +verifier_sock/invalidate_pkt_pointers_from_global_func +verifier_sock/map_lookup_elem_smap_key +verifier_sock/map_lookup_elem_sockhash_key +verifier_sock/map_lookup_elem_sockmap_key +verifier_sock/no_null_check_on_ret_1 +verifier_sock/no_null_check_on_ret_2 +verifier_sock/of_bpf_skc_to_helpers +verifier_sock/post_bind4_read_mark +verifier_sock/post_bind4_read_src_ip6 +verifier_sock/post_bind6_read_src_ip4 +verifier_sock/sk_1_1_value_1 +verifier_sock/sk_no_skb_sk_check_1 +verifier_sock/sk_no_skb_sk_check_2 +verifier_sock/sk_sk_type_fullsock_field_1 +verifier_sock/skb_sk_beyond_last_field_1 +verifier_sock/skb_sk_beyond_last_field_2 +verifier_sock/skb_sk_no_null_check +verifier_sock/sock_create_read_src_port +verifier_sock_addr/bind4_bad_return_code +verifier_sock_addr/bind6_bad_return_code +verifier_sock_addr/connect4_bad_return_code +verifier_sock_addr/connect6_bad_return_code +verifier_sock_addr/connect_unix_bad_return_code +verifier_sock_addr/getpeername4_bad_return_code +verifier_sock_addr/getpeername6_bad_return_code +verifier_sock_addr/getpeername_unix_bad_return_code +verifier_sock_addr/getsockname4_bad_return_code +verifier_sock_addr/getsockname6_bad_return_code +verifier_sock_addr/getsockname_unix_unix_bad_return_code +verifier_sock_addr/recvmsg4_bad_return_code +verifier_sock_addr/recvmsg6_bad_return_code +verifier_sock_addr/recvmsg_unix_bad_return_code +verifier_sock_addr/sendmsg4_bad_return_code +verifier_sock_addr/sendmsg6_bad_return_code +verifier_sock_addr/sendmsg_unix_bad_return_code +verifier_sockmap_mutate/test_flow_dissector_update +verifier_sockmap_mutate/test_raw_tp_delete +verifier_sockmap_mutate/test_raw_tp_update +verifier_sockmap_mutate/test_sockops_update +verifier_spill_fill/_6_offset_to_skb_data +verifier_spill_fill/addr_offset_to_skb_data +verifier_spill_fill/check_corrupted_spill_fill +verifier_spill_fill/fill_32bit_after_spill_64bit_clear_id +verifier_spill_fill/spill_16bit_of_32bit_fail +verifier_spill_fill/spill_32bit_of_64bit_fail +verifier_spill_fill/u64_offset_to_skb_data +verifier_spill_fill/with_invalid_reg_offset_0 +verifier_spin_lock/call_within_a_locked_region +verifier_spin_lock/lock_test2_direct_ld_st +verifier_spin_lock/lock_test3_direct_ld_st +verifier_spin_lock/lock_test4_direct_ld_st +verifier_spin_lock/lock_test7_unlock_without_lock +verifier_spin_lock/reg_id_for_map_value +verifier_spin_lock/spin_lock_test6_missing_unlock +verifier_spin_lock/spin_lock_test8_double_lock +verifier_spin_lock/spin_lock_test9_different_lock +verifier_spin_lock/test11_ld_abs_under_lock +verifier_stack_ptr/load_bad_alignment_on_off +verifier_stack_ptr/load_bad_alignment_on_reg +verifier_stack_ptr/load_out_of_bounds_high +verifier_stack_ptr/load_out_of_bounds_low +verifier_stack_ptr/to_stack_check_high_4 +verifier_stack_ptr/to_stack_check_high_5 +verifier_stack_ptr/to_stack_check_high_6 +verifier_stack_ptr/to_stack_check_high_7 +verifier_stack_ptr/to_stack_check_low_3 +verifier_stack_ptr/to_stack_check_low_4 +verifier_stack_ptr/to_stack_check_low_5 +verifier_stack_ptr/to_stack_check_low_6 +verifier_stack_ptr/to_stack_check_low_7 +verifier_subprog_precision/callback_precise_return_fail +verifier_tailcall_jit +verifier_uninit +verifier_unpriv +verifier_unpriv_perf +verifier_value/store_of_cleared_call_register +verifier_value_illegal_alu +verifier_value_or_null/map_access_from_else_condition +verifier_value_or_null/map_value_or_null_1 +verifier_value_or_null/map_value_or_null_2 +verifier_value_or_null/map_value_or_null_3 +verifier_value_or_null/multiple_map_lookup_elem_calls +verifier_value_or_null/null_check_ids_in_regsafe +verifier_value_ptr_arith/access_known_scalar_value_ptr_2 +verifier_value_ptr_arith/access_unknown_scalar_value_ptr +verifier_value_ptr_arith/access_value_ptr_known_scalar +verifier_value_ptr_arith/access_value_ptr_unknown_scalar +verifier_value_ptr_arith/access_value_ptr_value_ptr_1 +verifier_value_ptr_arith/access_value_ptr_value_ptr_2 +verifier_value_ptr_arith/lower_oob_arith_test_1 +verifier_value_ptr_arith/to_leak_tainted_dst_reg +verifier_value_ptr_arith/unknown_scalar_value_ptr_4 +verifier_value_ptr_arith/value_ptr_known_scalar_2_1 +verifier_value_ptr_arith/value_ptr_known_scalar_3 +verifier_var_off/access_max_out_of_bound +verifier_var_off/access_min_out_of_bound +verifier_var_off/stack_write_clobbers_spilled_regs +verifier_var_off/variable_offset_ctx_access +verifier_var_off/variable_offset_stack_access_unbounded +verifier_var_off/zero_sized_access_max_out_of_bound +verifier_vfs_reject +verifier_xadd/xadd_w_check_unaligned_map +verifier_xadd/xadd_w_check_unaligned_pkt +verifier_xadd/xadd_w_check_unaligned_stack +verifier_xdp_direct_packet_access/corner_case_1_bad_access_1 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_10 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_11 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_12 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_13 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_14 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_15 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_16 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_2 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_3 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_4 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_5 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_6 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_7 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_8 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_9 +verifier_xdp_direct_packet_access/end_mangling_bad_access_1 +verifier_xdp_direct_packet_access/end_mangling_bad_access_2 +verifier_xdp_direct_packet_access/pkt_data_bad_access_1_1 +verifier_xdp_direct_packet_access/pkt_data_bad_access_1_2 +verifier_xdp_direct_packet_access/pkt_data_bad_access_1_3 +verifier_xdp_direct_packet_access/pkt_data_bad_access_1_4 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_1 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_2 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_3 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_4 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_5 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_6 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_7 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_8 +verifier_xdp_direct_packet_access/pkt_end_bad_access_1_1 +verifier_xdp_direct_packet_access/pkt_end_bad_access_1_2 +verifier_xdp_direct_packet_access/pkt_end_bad_access_2_1 +verifier_xdp_direct_packet_access/pkt_end_bad_access_2_2 +verifier_xdp_direct_packet_access/pkt_end_bad_access_2_3 +verifier_xdp_direct_packet_access/pkt_end_bad_access_2_4 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_1_1 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_1_2 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_2_1 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_2_2 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_2_3 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_2_4 +verify_pkcs7_sig +xdp_synproxy diff --git a/ci/vmtest/configs/DENYLIST.x86_64 b/ci/vmtest/configs/DENYLIST.x86_64 new file mode 100644 index 0000000000000..6fc3413daab9f --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.x86_64 @@ -0,0 +1 @@ +netcnt # with kvm enabled, fail with packets unexpected packets: actual 10001 != expected 10000 diff --git a/ci/vmtest/configs/run-vmtest.env b/ci/vmtest/configs/run-vmtest.env new file mode 100644 index 0000000000000..c60f1db6673c7 --- /dev/null +++ b/ci/vmtest/configs/run-vmtest.env @@ -0,0 +1,42 @@ +#!/bin/bash + +# This file is sourced by libbpf/ci/run-vmtest Github Action scripts. +# +# The primary reason it exists is that assembling ALLOWLIST and +# DENYLIST for a particular test run is not a trivial operation. +# +# Users of libbpf/ci/run-vmtest action need to be able to specify a +# list of allow/denylist **files**, that later has to be correctly +# merged into a single allow/denylist passed to a test runner. +# +# Obviously it's perferrable for the scripts merging many lists into +# one to be reusable, and not copy-pasted between repositories which +# use libbpf/ci actions. And specifying the lists should be trivial. +# This file is a solution to that. + +# $SELFTESTS_BPF and $VMTEST_CONFIGS are set in the workflow, before +# libbpf/ci/run-vmtest action is called +# See .github/workflows/kernel-test.yml + +ALLOWLIST_FILES=( + "${SELFTESTS_BPF}/ALLOWLIST" + "${SELFTESTS_BPF}/ALLOWLIST.${ARCH}" + "${VMTEST_CONFIGS}/ALLOWLIST" + "${VMTEST_CONFIGS}/ALLOWLIST.${ARCH}" + "${VMTEST_CONFIGS}/ALLOWLIST.${DEPLOYMENT}" + "${VMTEST_CONFIGS}/ALLOWLIST.${KERNEL_TEST}" +) + +DENYLIST_FILES=( + "${SELFTESTS_BPF}/DENYLIST" + "${SELFTESTS_BPF}/DENYLIST.${ARCH}" + "${VMTEST_CONFIGS}/DENYLIST" + "${VMTEST_CONFIGS}/DENYLIST.${ARCH}" + "${VMTEST_CONFIGS}/DENYLIST.${DEPLOYMENT}" + "${VMTEST_CONFIGS}/DENYLIST.${KERNEL_TEST}" +) + +# Export pipe-separated strings, because bash doesn't support array export +export SELFTESTS_BPF_ALLOWLIST_FILES=$(IFS="|"; echo "${ALLOWLIST_FILES[*]}") +export SELFTESTS_BPF_DENYLIST_FILES=$(IFS="|"; echo "${DENYLIST_FILES[*]}") + diff --git a/ci/vmtest/configs/run_veristat.kernel.cfg b/ci/vmtest/configs/run_veristat.kernel.cfg new file mode 100644 index 0000000000000..807efc251073f --- /dev/null +++ b/ci/vmtest/configs/run_veristat.kernel.cfg @@ -0,0 +1,4 @@ +VERISTAT_OBJECTS_DIR="${SELFTESTS_BPF}" +VERISTAT_OBJECTS_GLOB="*.bpf.o" +VERISTAT_CFG_FILE="${SELFTESTS_BPF}/veristat.cfg" +VERISTAT_OUTPUT="veristat-kernel" diff --git a/ci/vmtest/configs/run_veristat.meta.cfg b/ci/vmtest/configs/run_veristat.meta.cfg new file mode 100644 index 0000000000000..14f08d241d206 --- /dev/null +++ b/ci/vmtest/configs/run_veristat.meta.cfg @@ -0,0 +1,4 @@ +VERISTAT_OBJECTS_DIR="${WORKING_DIR}/bpf_objects" +VERISTAT_OBJECTS_GLOB="*.o" +VERISTAT_OUTPUT="veristat-meta" +VERISTAT_CFG_FILE="${VERISTAT_CONFIGS}/veristat_meta.cfg" diff --git a/ci/vmtest/configs/veristat_meta.cfg b/ci/vmtest/configs/veristat_meta.cfg new file mode 100644 index 0000000000000..a8c25d71cb9e2 --- /dev/null +++ b/ci/vmtest/configs/veristat_meta.cfg @@ -0,0 +1,10 @@ +# List of exceptions we know about that are not going to work with veristat. + +# needs 'migrate_misplaced_page' which went away in +# commit 73eab3ca481e ("mm: migrate: convert migrate_misplaced_page() to migrate_misplaced_folio()") +!numamove_bpf-numamove_bpf.o + +# use non-libbpf loader +!takeover_bpf_lib-takeover.bpf.o +!tcp_tuner_bpf_lib-tcptuner.bpf.o + diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 0a725e46d017b..53816dfab6452 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -429,13 +430,23 @@ static int __init acpi_parse_cfmws(union acpi_subtable_headers *header, { struct acpi_cedt_cfmws *cfmws; int *fake_pxm = arg; - u64 start, end; + u64 start, end, align; int node; + int err; cfmws = (struct acpi_cedt_cfmws *)header; start = cfmws->base_hpa; end = cfmws->base_hpa + cfmws->window_size; + /* Align memblock size to CFMW regions if possible */ + align = 1UL << __ffs(start | end); + if (align >= SZ_256M) { + err = memory_block_advise_max_size(align); + if (err) + pr_warn("CFMWS: memblock size advise failed (%d)\n", err); + } else + pr_err("CFMWS: [BIOS BUG] base/size alignment violates spec\n"); + /* * The SRAT may have already described NUMA details for all, * or a portion of, this CFMWS HPA range. Extend the memblks @@ -453,7 +464,7 @@ static int __init acpi_parse_cfmws(union acpi_subtable_headers *header, return -EINVAL; } - if (numa_add_memblk(node, start, end) < 0) { + if (numa_add_reserved_memblk(node, start, end) < 0) { /* CXL driver must handle the NUMA_NO_NODE case */ pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n", node, start, end); diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 8f3a41d9bfaa2..6ed72c2580368 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -110,6 +110,57 @@ static void memory_block_release(struct device *dev) kfree(mem); } + +/* Max block size to be set by memory_block_advise_max_size */ +static unsigned long memory_block_advised_size; +static bool memory_block_advised_size_queried; + +/** + * memory_block_advise_max_size() - advise memory hotplug on the max suggested + * block size, usually for alignment. + * @size: suggestion for maximum block size. must be aligned on power of 2. + * + * Early boot software (pre-allocator init) may advise archs on the max block + * size. This value can only decrease after initialization, as the intent is + * to identify the largest supported alignment for all sources. + * + * Use of this value is arch-defined, as is min/max block size. + * + * Return: 0 on success + * -EINVAL if size is 0 or not pow2 aligned + * -EBUSY if value has already been probed + */ +int __init memory_block_advise_max_size(unsigned long size) +{ + if (!size || !is_power_of_2(size)) + return -EINVAL; + + if (memory_block_advised_size_queried) + return -EBUSY; + + if (memory_block_advised_size) + memory_block_advised_size = min(memory_block_advised_size, size); + else + memory_block_advised_size = size; + + return 0; +} + +/** + * memory_block_advised_max_size() - query advised max hotplug block size. + * + * After the first call, the value can never change. Callers looking for the + * actual block size should use memory_block_size_bytes. This interface is + * intended for use by arch-init when initializing the hotplug block size. + * + * Return: advised size in bytes, or 0 if never set. + */ +unsigned long memory_block_advised_max_size(void) +{ + memory_block_advised_size_queried = true; + return memory_block_advised_size; +} + unsigned long __weak memory_block_size_bytes(void) { return MIN_MEMORY_BLOCK_SIZE; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index fda7d8624889f..94e6e9b80bf07 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -734,114 +734,19 @@ static void read_from_bdev_async(struct zram *zram, struct page *page, submit_bio(bio); } -#define PAGE_WB_SIG "page_index=" - -#define PAGE_WRITEBACK 0 -#define HUGE_WRITEBACK (1<<0) -#define IDLE_WRITEBACK (1<<1) -#define INCOMPRESSIBLE_WRITEBACK (1<<2) - -static int scan_slots_for_writeback(struct zram *zram, u32 mode, - unsigned long nr_pages, - unsigned long index, - struct zram_pp_ctl *ctl) +static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl) { - for (; nr_pages != 0; index++, nr_pages--) { - bool ok = true; - - zram_slot_lock(zram, index); - if (!zram_allocated(zram, index)) - goto next; - - if (zram_test_flag(zram, index, ZRAM_WB) || - zram_test_flag(zram, index, ZRAM_SAME)) - goto next; - - if (mode & IDLE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_IDLE)) - goto next; - if (mode & HUGE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_HUGE)) - goto next; - if (mode & INCOMPRESSIBLE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) - goto next; - - ok = place_pp_slot(zram, ctl, index); -next: - zram_slot_unlock(zram, index); - if (!ok) - break; - } - - return 0; -} - -static ssize_t writeback_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - struct zram *zram = dev_to_zram(dev); - unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; - struct zram_pp_ctl *ctl = NULL; + unsigned long blk_idx = 0; + struct page *page = NULL; struct zram_pp_slot *pps; - unsigned long index = 0; - struct bio bio; struct bio_vec bio_vec; - struct page *page = NULL; - ssize_t ret = len; - int mode, err; - unsigned long blk_idx = 0; - - if (sysfs_streq(buf, "idle")) - mode = IDLE_WRITEBACK; - else if (sysfs_streq(buf, "huge")) - mode = HUGE_WRITEBACK; - else if (sysfs_streq(buf, "huge_idle")) - mode = IDLE_WRITEBACK | HUGE_WRITEBACK; - else if (sysfs_streq(buf, "incompressible")) - mode = INCOMPRESSIBLE_WRITEBACK; - else { - if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1)) - return -EINVAL; - - if (kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index) || - index >= nr_pages) - return -EINVAL; - - nr_pages = 1; - mode = PAGE_WRITEBACK; - } - - down_read(&zram->init_lock); - if (!init_done(zram)) { - ret = -EINVAL; - goto release_init_lock; - } - - /* Do not permit concurrent post-processing actions. */ - if (atomic_xchg(&zram->pp_in_progress, 1)) { - up_read(&zram->init_lock); - return -EAGAIN; - } - - if (!zram->backing_dev) { - ret = -ENODEV; - goto release_init_lock; - } + struct bio bio; + int ret = 0, err; + u32 index; page = alloc_page(GFP_KERNEL); - if (!page) { - ret = -ENOMEM; - goto release_init_lock; - } - - ctl = init_pp_ctl(); - if (!ctl) { - ret = -ENOMEM; - goto release_init_lock; - } - - scan_slots_for_writeback(zram, mode, nr_pages, index, ctl); + if (!page) + return -ENOMEM; while ((pps = select_pp_slot(ctl))) { spin_lock(&zram->wb_limit_lock); @@ -929,10 +834,215 @@ static ssize_t writeback_store(struct device *dev, if (blk_idx) free_block_bdev(zram, blk_idx); - -release_init_lock: if (page) __free_page(page); + + return ret; +} + +#define PAGE_WRITEBACK 0 +#define HUGE_WRITEBACK (1 << 0) +#define IDLE_WRITEBACK (1 << 1) +#define INCOMPRESSIBLE_WRITEBACK (1 << 2) + +static int parse_page_index(char *val, unsigned long nr_pages, + unsigned long *lo, unsigned long *hi) +{ + int ret; + + ret = kstrtoul(val, 10, lo); + if (ret) + return ret; + if (*lo >= nr_pages) + return -ERANGE; + *hi = *lo + 1; + return 0; +} + +static int parse_page_indexes(char *val, unsigned long nr_pages, + unsigned long *lo, unsigned long *hi) +{ + char *delim; + int ret; + + delim = strchr(val, '-'); + if (!delim) + return -EINVAL; + + *delim = 0x00; + ret = kstrtoul(val, 10, lo); + if (ret) + return ret; + if (*lo >= nr_pages) + return -ERANGE; + + ret = kstrtoul(delim + 1, 10, hi); + if (ret) + return ret; + if (*hi >= nr_pages || *lo > *hi) + return -ERANGE; + *hi += 1; + return 0; +} + +static int parse_mode(char *val, u32 *mode) +{ + *mode = 0; + + if (!strcmp(val, "idle")) + *mode = IDLE_WRITEBACK; + if (!strcmp(val, "huge")) + *mode = HUGE_WRITEBACK; + if (!strcmp(val, "huge_idle")) + *mode = IDLE_WRITEBACK | HUGE_WRITEBACK; + if (!strcmp(val, "incompressible")) + *mode = INCOMPRESSIBLE_WRITEBACK; + + if (*mode == 0) + return -EINVAL; + return 0; +} + +static int scan_slots_for_writeback(struct zram *zram, u32 mode, + unsigned long lo, unsigned long hi, + struct zram_pp_ctl *ctl) +{ + u32 index = lo; + + while (index < hi) { + bool ok = true; + + zram_slot_lock(zram, index); + if (!zram_allocated(zram, index)) + goto next; + + if (zram_test_flag(zram, index, ZRAM_WB) || + zram_test_flag(zram, index, ZRAM_SAME)) + goto next; + + if (mode & IDLE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_IDLE)) + goto next; + if (mode & HUGE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_HUGE)) + goto next; + if (mode & INCOMPRESSIBLE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) + goto next; + + ok = place_pp_slot(zram, ctl, index); +next: + zram_slot_unlock(zram, index); + if (!ok) + break; + index++; + } + + return 0; +} + +static ssize_t writeback_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + u64 nr_pages = zram->disksize >> PAGE_SHIFT; + unsigned long lo = 0, hi = nr_pages; + struct zram_pp_ctl *ctl = NULL; + char *args, *param, *val; + ssize_t ret = len; + int err, mode = 0; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + return -EINVAL; + } + + /* Do not permit concurrent post-processing actions. */ + if (atomic_xchg(&zram->pp_in_progress, 1)) { + up_read(&zram->init_lock); + return -EAGAIN; + } + + if (!zram->backing_dev) { + ret = -ENODEV; + goto release_init_lock; + } + + ctl = init_pp_ctl(); + if (!ctl) { + ret = -ENOMEM; + goto release_init_lock; + } + + args = skip_spaces(buf); + while (*args) { + args = next_arg(args, ¶m, &val); + + /* + * Workaround to support the old writeback interface. + * + * The old writeback interface has a minor inconsistency and + * requires key=value only for page_index parameter, while the + * writeback mode is a valueless parameter. + * + * This is not the case anymore and now all parameters are + * required to have values, however, we need to support the + * legacy writeback interface format so we check if we can + * recognize a valueless parameter as the (legacy) writeback + * mode. + */ + if (!val || !*val) { + err = parse_mode(param, &mode); + if (err) { + ret = err; + goto release_init_lock; + } + + scan_slots_for_writeback(zram, mode, lo, hi, ctl); + break; + } + + if (!strcmp(param, "type")) { + err = parse_mode(val, &mode); + if (err) { + ret = err; + goto release_init_lock; + } + + scan_slots_for_writeback(zram, mode, lo, hi, ctl); + break; + } + + if (!strcmp(param, "page_index")) { + err = parse_page_index(val, nr_pages, &lo, &hi); + if (err) { + ret = err; + goto release_init_lock; + } + + scan_slots_for_writeback(zram, mode, lo, hi, ctl); + continue; + } + + if (!strcmp(param, "page_indexes")) { + err = parse_page_indexes(val, nr_pages, &lo, &hi); + if (err) { + ret = err; + goto release_init_lock; + } + + scan_slots_for_writeback(zram, mode, lo, hi, ctl); + continue; + } + } + + err = zram_writeback_slots(zram, ctl); + if (err) + ret = err; + +release_init_lock: release_pp_ctl(zram, ctl); atomic_set(&zram->pp_in_progress, 0); up_read(&zram->init_lock); @@ -1694,7 +1804,7 @@ static int write_incompressible_page(struct zram *zram, struct page *page, */ handle = zs_malloc(zram->mem_pool, PAGE_SIZE, GFP_NOIO | __GFP_NOWARN | - __GFP_HIGHMEM | __GFP_MOVABLE); + __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page)); if (IS_ERR_VALUE(handle)) return PTR_ERR((void *)handle); @@ -1761,7 +1871,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) handle = zs_malloc(zram->mem_pool, comp_len, GFP_NOIO | __GFP_NOWARN | - __GFP_HIGHMEM | __GFP_MOVABLE); + __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page)); if (IS_ERR_VALUE(handle)) { zcomp_stream_put(zstrm); return PTR_ERR((void *)handle); @@ -1981,10 +2091,15 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, * We are holding per-CPU stream mutex and entry lock so better * avoid direct reclaim. Allocation error is not fatal since * we still have the old object in the mem_pool. + * + * XXX: technically, the node we really want here is the node that holds + * the original compressed data. But that would require us to modify + * zsmalloc API to return this information. For now, we will make do with + * the node of the page allocated for recompression. */ handle_new = zs_malloc(zram->mem_pool, comp_len_new, GFP_NOIO | __GFP_NOWARN | - __GFP_HIGHMEM | __GFP_MOVABLE); + __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page)); if (IS_ERR_VALUE(handle_new)) { zcomp_stream_put(zstrm); return PTR_ERR((void *)handle_new); diff --git a/drivers/fpga/tests/fpga-bridge-test.c b/drivers/fpga/tests/fpga-bridge-test.c index b9ab29809e961..124ba40e32b18 100644 --- a/drivers/fpga/tests/fpga-bridge-test.c +++ b/drivers/fpga/tests/fpga-bridge-test.c @@ -170,4 +170,5 @@ static struct kunit_suite fpga_bridge_suite = { kunit_test_suite(fpga_bridge_suite); +MODULE_DESCRIPTION("KUnit test for the FPGA Bridge"); MODULE_LICENSE("GPL"); diff --git a/drivers/fpga/tests/fpga-mgr-test.c b/drivers/fpga/tests/fpga-mgr-test.c index 9cb37aefbac4b..8748babb05045 100644 --- a/drivers/fpga/tests/fpga-mgr-test.c +++ b/drivers/fpga/tests/fpga-mgr-test.c @@ -330,4 +330,5 @@ static struct kunit_suite fpga_mgr_suite = { kunit_test_suite(fpga_mgr_suite); +MODULE_DESCRIPTION("KUnit test for the FPGA Manager"); MODULE_LICENSE("GPL"); diff --git a/drivers/fpga/tests/fpga-region-test.c b/drivers/fpga/tests/fpga-region-test.c index 6a108cafded86..020ceac48509f 100644 --- a/drivers/fpga/tests/fpga-region-test.c +++ b/drivers/fpga/tests/fpga-region-test.c @@ -214,4 +214,5 @@ static struct kunit_suite fpga_region_suite = { kunit_test_suite(fpga_region_suite); +MODULE_DESCRIPTION("KUnit test for the FPGA Region"); MODULE_LICENSE("GPL"); diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index aedd0e2dcd890..0edd639898a63 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -25,6 +25,7 @@ #include #include #include +#include #include /* for COMMAND_LINE_SIZE */ #include @@ -875,6 +876,36 @@ void __init early_init_dt_check_for_usable_mem_range(void) memblock_add(rgn[i].base, rgn[i].size); } +/** + * early_init_dt_check_kho - Decode info required for kexec handover from DT + */ +static void __init early_init_dt_check_kho(void) +{ + unsigned long node = chosen_node_offset; + u64 fdt_start, fdt_size, scratch_start, scratch_size; + const __be32 *p; + int l; + + if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER) || (long)node < 0) + return; + + p = of_get_flat_dt_prop(node, "linux,kho-fdt", &l); + if (l != (dt_root_addr_cells + dt_root_size_cells) * sizeof(__be32)) + return; + + fdt_start = dt_mem_next_cell(dt_root_addr_cells, &p); + fdt_size = dt_mem_next_cell(dt_root_addr_cells, &p); + + p = of_get_flat_dt_prop(node, "linux,kho-scratch", &l); + if (l != (dt_root_addr_cells + dt_root_size_cells) * sizeof(__be32)) + return; + + scratch_start = dt_mem_next_cell(dt_root_addr_cells, &p); + scratch_size = dt_mem_next_cell(dt_root_addr_cells, &p); + + kho_populate(fdt_start, fdt_size, scratch_start, scratch_size); +} + #ifdef CONFIG_SERIAL_EARLYCON int __init early_init_dt_scan_chosen_stdout(void) @@ -1169,6 +1200,9 @@ void __init early_init_dt_scan_nodes(void) /* Handle linux,usable-memory-range property */ early_init_dt_check_for_usable_mem_range(); + + /* Handle kexec handover */ + early_init_dt_check_kho(); } bool __init early_init_dt_scan(void *dt_virt, phys_addr_t dt_phys) diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c index 5b924597a4deb..1ee2d31816aeb 100644 --- a/drivers/of/kexec.c +++ b/drivers/of/kexec.c @@ -264,6 +264,43 @@ static inline int setup_ima_buffer(const struct kimage *image, void *fdt, } #endif /* CONFIG_IMA_KEXEC */ +static int kho_add_chosen(const struct kimage *image, void *fdt, int chosen_node) +{ + int ret = 0; +#ifdef CONFIG_KEXEC_HANDOVER + phys_addr_t fdt_mem = 0; + phys_addr_t fdt_len = 0; + phys_addr_t scratch_mem = 0; + phys_addr_t scratch_len = 0; + + ret = fdt_delprop(fdt, chosen_node, "linux,kho-fdt"); + if (ret && ret != -FDT_ERR_NOTFOUND) + return ret; + ret = fdt_delprop(fdt, chosen_node, "linux,kho-scratch"); + if (ret && ret != -FDT_ERR_NOTFOUND) + return ret; + + if (!image->kho.fdt || !image->kho.scratch) + return 0; + + fdt_mem = image->kho.fdt; + fdt_len = PAGE_SIZE; + scratch_mem = image->kho.scratch->mem; + scratch_len = image->kho.scratch->bufsz; + + pr_debug("Adding kho metadata to DT"); + + ret = fdt_appendprop_addrrange(fdt, 0, chosen_node, "linux,kho-fdt", + fdt_mem, fdt_len); + if (ret) + return ret; + ret = fdt_appendprop_addrrange(fdt, 0, chosen_node, "linux,kho-scratch", + scratch_mem, scratch_len); + +#endif /* CONFIG_KEXEC_HANDOVER */ + return ret; +} + /* * of_kexec_alloc_and_setup_fdt - Alloc and setup a new Flattened Device Tree * @@ -414,6 +451,11 @@ void *of_kexec_alloc_and_setup_fdt(const struct kimage *image, #endif } + /* Add kho metadata if this is a KHO image */ + ret = kho_add_chosen(image, fdt, chosen_node); + if (ret) + goto out; + /* add bootargs */ if (cmdline) { ret = fdt_setprop_string(fdt, chosen_node, "bootargs", cmdline); diff --git a/fs/dax.c b/fs/dax.c index af5045b0f476e..5087ca3b1f7bb 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -396,6 +396,7 @@ static inline unsigned long dax_folio_put(struct folio *folio) order = folio_order(folio); if (!order) return 0; + folio_reset_order(folio); for (i = 0; i < (1UL << order); i++) { struct dev_pagemap *pgmap = page_pgmap(&folio->page); @@ -1421,8 +1422,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); mm_inc_nr_ptes(vma->vm_mm); } - pmd_entry = mk_pmd(&zero_folio->page, vmf->vma->vm_page_prot); - pmd_entry = pmd_mkhuge(pmd_entry); + pmd_entry = folio_mk_pmd(zero_folio, vmf->vma->vm_page_prot); set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); spin_unlock(ptl); trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry); diff --git a/fs/proc/page.c b/fs/proc/page.c index 23fc771100ae5..999af26c72985 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -22,6 +22,12 @@ #define KPMMASK (KPMSIZE - 1) #define KPMBITS (KPMSIZE * BITS_PER_BYTE) +enum kpage_operation { + KPAGE_FLAGS, + KPAGE_COUNT, + KPAGE_CGROUP, +}; + static inline unsigned long get_max_dump_pfn(void) { #ifdef CONFIG_SPARSEMEM @@ -37,19 +43,17 @@ static inline unsigned long get_max_dump_pfn(void) #endif } -/* /proc/kpagecount - an array exposing page mapcounts - * - * Each entry is a u64 representing the corresponding - * physical page mapcount. - */ -static ssize_t kpagecount_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t kpage_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, + enum kpage_operation op) { const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; + struct page *page; unsigned long src = *ppos; unsigned long pfn; ssize_t ret = 0; + u64 info; pfn = src / KPMSIZE; if (src & KPMMASK || count & KPMMASK) @@ -59,24 +63,34 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); while (count > 0) { - struct page *page; - u64 mapcount = 0; - /* * TODO: ZONE_DEVICE support requires to identify * memmaps that were actually initialized. */ page = pfn_to_online_page(pfn); - if (page) { - struct folio *folio = page_folio(page); - if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) - mapcount = folio_precise_page_mapcount(folio, page); - else - mapcount = folio_average_page_mapcount(folio); - } - - if (put_user(mapcount, out)) { + if (page) { + switch (op) { + case KPAGE_FLAGS: + info = stable_page_flags(page); + break; + case KPAGE_COUNT: + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + info = folio_precise_page_mapcount(page_folio(page), page); + else + info = folio_average_page_mapcount(page_folio(page)); + break; + case KPAGE_CGROUP: + info = page_cgroup_ino(page); + break; + default: + info = 0; + break; + } + } else + info = 0; + + if (put_user(info, out)) { ret = -EFAULT; break; } @@ -94,17 +108,23 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, return ret; } +/* /proc/kpagecount - an array exposing page mapcounts + * + * Each entry is a u64 representing the corresponding + * physical page mapcount. + */ +static ssize_t kpagecount_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return kpage_read(file, buf, count, ppos, KPAGE_COUNT); +} + static const struct proc_ops kpagecount_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpagecount_read, }; -/* /proc/kpageflags - an array exposing page flags - * - * Each entry is a u64 representing the corresponding - * physical page flags. - */ static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) { @@ -225,47 +245,17 @@ u64 stable_page_flags(const struct page *page) #endif return u; -}; +} +/* /proc/kpageflags - an array exposing page flags + * + * Each entry is a u64 representing the corresponding + * physical page flags. + */ static ssize_t kpageflags_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { - const unsigned long max_dump_pfn = get_max_dump_pfn(); - u64 __user *out = (u64 __user *)buf; - unsigned long src = *ppos; - unsigned long pfn; - ssize_t ret = 0; - - pfn = src / KPMSIZE; - if (src & KPMMASK || count & KPMMASK) - return -EINVAL; - if (src >= max_dump_pfn * KPMSIZE) - return 0; - count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); - - while (count > 0) { - /* - * TODO: ZONE_DEVICE support requires to identify - * memmaps that were actually initialized. - */ - struct page *page = pfn_to_online_page(pfn); - - if (put_user(stable_page_flags(page), out)) { - ret = -EFAULT; - break; - } - - pfn++; - out++; - count -= KPMSIZE; - - cond_resched(); - } - - *ppos += (char __user *)out - buf; - if (!ret) - ret = (char __user *)out - buf; - return ret; + return kpage_read(file, buf, count, ppos, KPAGE_FLAGS); } static const struct proc_ops kpageflags_proc_ops = { @@ -276,53 +266,10 @@ static const struct proc_ops kpageflags_proc_ops = { #ifdef CONFIG_MEMCG static ssize_t kpagecgroup_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { - const unsigned long max_dump_pfn = get_max_dump_pfn(); - u64 __user *out = (u64 __user *)buf; - struct page *ppage; - unsigned long src = *ppos; - unsigned long pfn; - ssize_t ret = 0; - u64 ino; - - pfn = src / KPMSIZE; - if (src & KPMMASK || count & KPMMASK) - return -EINVAL; - if (src >= max_dump_pfn * KPMSIZE) - return 0; - count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); - - while (count > 0) { - /* - * TODO: ZONE_DEVICE support requires to identify - * memmaps that were actually initialized. - */ - ppage = pfn_to_online_page(pfn); - - if (ppage) - ino = page_cgroup_ino(ppage); - else - ino = 0; - - if (put_user(ino, out)) { - ret = -EFAULT; - break; - } - - pfn++; - out++; - count -= KPMSIZE; - - cond_resched(); - } - - *ppos += (char __user *)out - buf; - if (!ret) - ret = (char __user *)out - buf; - return ret; + return kpage_read(file, buf, count, ppos, KPAGE_CGROUP); } - static const struct proc_ops kpagecgroup_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 994cde10e3f4d..b9e4fbbdf6e67 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2087,7 +2087,8 @@ static int pagemap_release(struct inode *inode, struct file *file) #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ PAGE_IS_FILE | PAGE_IS_PRESENT | \ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ - PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY) + PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \ + PAGE_IS_GUARD) #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) struct pagemap_scan_private { @@ -2128,12 +2129,14 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; - if (p->masks_of_interest & PAGE_IS_FILE) { - swp = pte_to_swp_entry(pte); - if (is_pfn_swap_entry(swp) && - !folio_test_anon(pfn_swap_entry_folio(swp))) - categories |= PAGE_IS_FILE; - } + swp = pte_to_swp_entry(pte); + if (is_guard_swp_entry(swp)) + categories |= PAGE_IS_GUARD; + else if ((p->masks_of_interest & PAGE_IS_FILE) && + is_pfn_swap_entry(swp) && + !folio_test_anon(pfn_swap_entry_folio(swp))) + categories |= PAGE_IS_FILE; + if (pte_swp_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 2afc95bf1655f..3e0a8fe9b1082 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -5,11 +5,6 @@ #include #include -static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot) -{ - return mk_pte(page, pgprot); -} - static inline unsigned long huge_pte_write(pte_t pte) { return pte_write(pte); diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 892ece4558a20..3c8ec3bfea447 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -23,6 +23,11 @@ static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm) if (!ptdesc) return NULL; + if (!pagetable_pte_ctor(mm, ptdesc)) { + pagetable_free(ptdesc); + return NULL; + } + return ptdesc_address(ptdesc); } #define __pte_alloc_one_kernel(...) alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__)) @@ -48,7 +53,7 @@ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm) */ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - pagetable_free(virt_to_ptdesc(pte)); + pagetable_dtor_free(virt_to_ptdesc(pte)); } /** @@ -70,7 +75,7 @@ static inline pgtable_t __pte_alloc_one_noprof(struct mm_struct *mm, gfp_t gfp) ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; - if (!pagetable_pte_ctor(ptdesc)) { + if (!pagetable_pte_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } @@ -137,7 +142,7 @@ static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long ad ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; - if (!pagetable_pmd_ctor(ptdesc)) { + if (!pagetable_pmd_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index 182b039ce5fa1..c5a3ad53beec6 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -37,6 +37,20 @@ struct pt_regs; */ int syscall_get_nr(struct task_struct *task, struct pt_regs *regs); +/** + * syscall_set_nr - change the system call a task is executing + * @task: task of interest, must be blocked + * @regs: task_pt_regs() of @task + * @nr: system call number + * + * Changes the system call number @task is about to execute. + * + * It's only valid to call this when @task is stopped for tracing on + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. + */ +void syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr); + /** * syscall_rollback - roll back registers after an aborted system call * @task: task of interest, must be in system call exit tracing @@ -117,6 +131,22 @@ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args); +/** + * syscall_set_arguments - change system call parameter value + * @task: task of interest, must be in system call entry tracing + * @regs: task_pt_regs() of @task + * @args: array of argument values to store + * + * Changes 6 arguments to the system call. + * The first argument gets value @args[0], and so on. + * + * It's only valid to call this when @task is stopped for tracing on + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. + */ +void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, + const unsigned long *args); + /** * syscall_get_arch - return the AUDIT_ARCH for the current system call * @task: task of interest, must be blocked diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 8e7af9a03b41d..e721148c95d07 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -249,6 +249,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(debug_locks && + (inode->i_sb->s_iflags & SB_I_CGROUPWB) && (!lockdep_is_held(&inode->i_lock) && !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) && !lockdep_is_held(&inode->i_wb->list_lock))); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e893d546a49f4..f190998b2ebd0 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -495,8 +495,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) struct folio *mm_get_huge_zero_folio(struct mm_struct *mm); void mm_put_huge_zero_folio(struct mm_struct *mm); -#define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) - static inline bool thp_migration_supported(void) { return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8f3ac832ee7f3..a57bed83c657b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -695,7 +695,7 @@ struct huge_bootmem_page { bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m); -int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); +int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); void wait_for_freed_hugetlb_folios(void); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, @@ -1083,7 +1083,7 @@ static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, return NULL; } -static inline int isolate_or_dissolve_huge_page(struct page *page, +static inline int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) { return -ENOMEM; diff --git a/include/linux/kexec.h b/include/linux/kexec.h index c8971861521a5..d3cad108b8076 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -371,6 +371,13 @@ struct kimage { size_t ima_buffer_size; #endif +#ifdef CONFIG_KEXEC_HANDOVER + struct { + struct kexec_segment *scratch; + phys_addr_t fdt; + } kho; +#endif + /* Core ELF header buffer */ void *elf_headers; unsigned long elf_headers_sz; diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h new file mode 100644 index 0000000000000..d96ba4fb27704 --- /dev/null +++ b/include/linux/kexec_handover.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_KEXEC_HANDOVER_H +#define LINUX_KEXEC_HANDOVER_H + +#include + +struct kho_scratch { + phys_addr_t addr; + phys_addr_t size; +}; + +/* KHO Notifier index */ +enum kho_event { + KEXEC_KHO_FINALIZE = 0, + KEXEC_KHO_ABORT = 1, +}; + +struct folio; +struct notifier_block; + +#define DECLARE_KHOSER_PTR(name, type) \ + union { \ + phys_addr_t phys; \ + type ptr; \ + } name +#define KHOSER_STORE_PTR(dest, val) \ + ({ \ + typeof(val) v = val; \ + typecheck(typeof((dest).ptr), v); \ + (dest).phys = virt_to_phys(v); \ + }) +#define KHOSER_LOAD_PTR(src) \ + ({ \ + typeof(src) s = src; \ + (typeof((s).ptr))((s).phys ? phys_to_virt((s).phys) : NULL); \ + }) + +struct kho_serialization; + +#ifdef CONFIG_KEXEC_HANDOVER +bool kho_is_enabled(void); + +int kho_preserve_folio(struct kho_serialization *ser, struct folio *folio); +int kho_preserve_phys(struct kho_serialization *ser, phys_addr_t phys, + size_t size); +struct folio *kho_restore_folio(phys_addr_t phys); +int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt); +int kho_retrieve_subtree(const char *name, phys_addr_t *phys); + +int register_kho_notifier(struct notifier_block *nb); +int unregister_kho_notifier(struct notifier_block *nb); + +void kho_memory_init(void); + +void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, + u64 scratch_len); +#else +static inline bool kho_is_enabled(void) +{ + return false; +} + +static inline int kho_preserve_folio(struct kho_serialization *ser, + struct folio *folio) +{ + return -EOPNOTSUPP; +} + +static inline int kho_preserve_phys(struct kho_serialization *ser, + phys_addr_t phys, size_t size) +{ + return -EOPNOTSUPP; +} + +static inline struct folio *kho_restore_folio(phys_addr_t phys) +{ + return NULL; +} + +static inline int kho_add_subtree(struct kho_serialization *ser, + const char *name, void *fdt) +{ + return -EOPNOTSUPP; +} + +static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +{ + return -EOPNOTSUPP; +} + +static inline int register_kho_notifier(struct notifier_block *nb) +{ + return -EOPNOTSUPP; +} + +static inline int unregister_kho_notifier(struct notifier_block *nb) +{ + return -EOPNOTSUPP; +} + +static inline void kho_memory_init(void) +{ +} + +static inline void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, + phys_addr_t scratch_phys, u64 scratch_len) +{ +} +#endif /* CONFIG_KEXEC_HANDOVER */ + +#endif /* LINUX_KEXEC_HANDOVER_H */ diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h index 1a0bc35839e36..16a2ee4f8310b 100644 --- a/include/linux/local_lock.h +++ b/include/linux/local_lock.h @@ -52,44 +52,23 @@ __local_unlock_irqrestore(lock, flags) /** - * localtry_lock_init - Runtime initialize a lock instance - */ -#define localtry_lock_init(lock) __localtry_lock_init(lock) - -/** - * localtry_lock - Acquire a per CPU local lock - * @lock: The lock variable - */ -#define localtry_lock(lock) __localtry_lock(lock) - -/** - * localtry_lock_irq - Acquire a per CPU local lock and disable interrupts - * @lock: The lock variable - */ -#define localtry_lock_irq(lock) __localtry_lock_irq(lock) - -/** - * localtry_lock_irqsave - Acquire a per CPU local lock, save and disable - * interrupts - * @lock: The lock variable - * @flags: Storage for interrupt flags + * local_lock_init - Runtime initialize a lock instance */ -#define localtry_lock_irqsave(lock, flags) \ - __localtry_lock_irqsave(lock, flags) +#define local_trylock_init(lock) __local_trylock_init(lock) /** - * localtry_trylock - Try to acquire a per CPU local lock. + * local_trylock - Try to acquire a per CPU local lock * @lock: The lock variable * * The function can be used in any context such as NMI or HARDIRQ. Due to * locking constrains it will _always_ fail to acquire the lock in NMI or * HARDIRQ context on PREEMPT_RT. */ -#define localtry_trylock(lock) __localtry_trylock(lock) +#define local_trylock(lock) __local_trylock(lock) /** - * localtry_trylock_irqsave - Try to acquire a per CPU local lock, save and disable - * interrupts if acquired + * local_trylock_irqsave - Try to acquire a per CPU local lock, save and disable + * interrupts if acquired * @lock: The lock variable * @flags: Storage for interrupt flags * @@ -97,29 +76,8 @@ * locking constrains it will _always_ fail to acquire the lock in NMI or * HARDIRQ context on PREEMPT_RT. */ -#define localtry_trylock_irqsave(lock, flags) \ - __localtry_trylock_irqsave(lock, flags) - -/** - * local_unlock - Release a per CPU local lock - * @lock: The lock variable - */ -#define localtry_unlock(lock) __localtry_unlock(lock) - -/** - * local_unlock_irq - Release a per CPU local lock and enable interrupts - * @lock: The lock variable - */ -#define localtry_unlock_irq(lock) __localtry_unlock_irq(lock) - -/** - * localtry_unlock_irqrestore - Release a per CPU local lock and restore - * interrupt flags - * @lock: The lock variable - * @flags: Interrupt flags to restore - */ -#define localtry_unlock_irqrestore(lock, flags) \ - __localtry_unlock_irqrestore(lock, flags) +#define local_trylock_irqsave(lock, flags) \ + __local_trylock_irqsave(lock, flags) DEFINE_GUARD(local_lock, local_lock_t __percpu*, local_lock(_T), diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index 67bd13d142fac..bf2bf40d7b18d 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -15,10 +15,11 @@ typedef struct { #endif } local_lock_t; +/* local_trylock() and local_trylock_irqsave() only work with local_trylock_t */ typedef struct { local_lock_t llock; - unsigned int acquired; -} localtry_lock_t; + u8 acquired; +} local_trylock_t; #ifdef CONFIG_DEBUG_LOCK_ALLOC # define LOCAL_LOCK_DEBUG_INIT(lockname) \ @@ -29,6 +30,9 @@ typedef struct { }, \ .owner = NULL, +# define LOCAL_TRYLOCK_DEBUG_INIT(lockname) \ + .llock = { LOCAL_LOCK_DEBUG_INIT((lockname).llock) }, + static inline void local_lock_acquire(local_lock_t *l) { lock_map_acquire(&l->dep_map); @@ -56,6 +60,7 @@ static inline void local_lock_debug_init(local_lock_t *l) } #else /* CONFIG_DEBUG_LOCK_ALLOC */ # define LOCAL_LOCK_DEBUG_INIT(lockname) +# define LOCAL_TRYLOCK_DEBUG_INIT(lockname) static inline void local_lock_acquire(local_lock_t *l) { } static inline void local_trylock_acquire(local_lock_t *l) { } static inline void local_lock_release(local_lock_t *l) { } @@ -63,7 +68,7 @@ static inline void local_lock_debug_init(local_lock_t *l) { } #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ #define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) } -#define INIT_LOCALTRY_LOCK(lockname) { .llock = { LOCAL_LOCK_DEBUG_INIT(lockname.llock) }} +#define INIT_LOCAL_TRYLOCK(lockname) { LOCAL_TRYLOCK_DEBUG_INIT(lockname) } #define __local_lock_init(lock) \ do { \ @@ -76,6 +81,8 @@ do { \ local_lock_debug_init(lock); \ } while (0) +#define __local_trylock_init(lock) __local_lock_init(lock.llock) + #define __spinlock_nested_bh_init(lock) \ do { \ static struct lock_class_key __key; \ @@ -87,149 +94,117 @@ do { \ local_lock_debug_init(lock); \ } while (0) +#define __local_lock_acquire(lock) \ + do { \ + local_trylock_t *tl; \ + local_lock_t *l; \ + \ + l = (local_lock_t *)this_cpu_ptr(lock); \ + tl = (local_trylock_t *)l; \ + _Generic((lock), \ + local_trylock_t *: ({ \ + lockdep_assert(tl->acquired == 0); \ + WRITE_ONCE(tl->acquired, 1); \ + }), \ + default:(void)0); \ + local_lock_acquire(l); \ + } while (0) + #define __local_lock(lock) \ do { \ preempt_disable(); \ - local_lock_acquire(this_cpu_ptr(lock)); \ + __local_lock_acquire(lock); \ } while (0) #define __local_lock_irq(lock) \ do { \ local_irq_disable(); \ - local_lock_acquire(this_cpu_ptr(lock)); \ + __local_lock_acquire(lock); \ } while (0) #define __local_lock_irqsave(lock, flags) \ do { \ local_irq_save(flags); \ - local_lock_acquire(this_cpu_ptr(lock)); \ - } while (0) - -#define __local_unlock(lock) \ - do { \ - local_lock_release(this_cpu_ptr(lock)); \ - preempt_enable(); \ + __local_lock_acquire(lock); \ } while (0) -#define __local_unlock_irq(lock) \ - do { \ - local_lock_release(this_cpu_ptr(lock)); \ - local_irq_enable(); \ - } while (0) - -#define __local_unlock_irqrestore(lock, flags) \ - do { \ - local_lock_release(this_cpu_ptr(lock)); \ - local_irq_restore(flags); \ - } while (0) - -#define __local_lock_nested_bh(lock) \ - do { \ - lockdep_assert_in_softirq(); \ - local_lock_acquire(this_cpu_ptr(lock)); \ - } while (0) - -#define __local_unlock_nested_bh(lock) \ - local_lock_release(this_cpu_ptr(lock)) - -/* localtry_lock_t variants */ - -#define __localtry_lock_init(lock) \ -do { \ - __local_lock_init(&(lock)->llock); \ - WRITE_ONCE((lock)->acquired, 0); \ -} while (0) - -#define __localtry_lock(lock) \ - do { \ - localtry_lock_t *lt; \ - preempt_disable(); \ - lt = this_cpu_ptr(lock); \ - local_lock_acquire(<->llock); \ - WRITE_ONCE(lt->acquired, 1); \ - } while (0) - -#define __localtry_lock_irq(lock) \ - do { \ - localtry_lock_t *lt; \ - local_irq_disable(); \ - lt = this_cpu_ptr(lock); \ - local_lock_acquire(<->llock); \ - WRITE_ONCE(lt->acquired, 1); \ - } while (0) - -#define __localtry_lock_irqsave(lock, flags) \ - do { \ - localtry_lock_t *lt; \ - local_irq_save(flags); \ - lt = this_cpu_ptr(lock); \ - local_lock_acquire(<->llock); \ - WRITE_ONCE(lt->acquired, 1); \ - } while (0) - -#define __localtry_trylock(lock) \ +#define __local_trylock(lock) \ ({ \ - localtry_lock_t *lt; \ - bool _ret; \ + local_trylock_t *tl; \ \ preempt_disable(); \ - lt = this_cpu_ptr(lock); \ - if (!READ_ONCE(lt->acquired)) { \ - WRITE_ONCE(lt->acquired, 1); \ - local_trylock_acquire(<->llock); \ - _ret = true; \ - } else { \ - _ret = false; \ + tl = this_cpu_ptr(lock); \ + if (READ_ONCE(tl->acquired)) { \ preempt_enable(); \ + tl = NULL; \ + } else { \ + WRITE_ONCE(tl->acquired, 1); \ + local_trylock_acquire( \ + (local_lock_t *)tl); \ } \ - _ret; \ + !!tl; \ }) -#define __localtry_trylock_irqsave(lock, flags) \ +#define __local_trylock_irqsave(lock, flags) \ ({ \ - localtry_lock_t *lt; \ - bool _ret; \ + local_trylock_t *tl; \ \ local_irq_save(flags); \ - lt = this_cpu_ptr(lock); \ - if (!READ_ONCE(lt->acquired)) { \ - WRITE_ONCE(lt->acquired, 1); \ - local_trylock_acquire(<->llock); \ - _ret = true; \ - } else { \ - _ret = false; \ + tl = this_cpu_ptr(lock); \ + if (READ_ONCE(tl->acquired)) { \ local_irq_restore(flags); \ + tl = NULL; \ + } else { \ + WRITE_ONCE(tl->acquired, 1); \ + local_trylock_acquire( \ + (local_lock_t *)tl); \ } \ - _ret; \ + !!tl; \ }) -#define __localtry_unlock(lock) \ +#define __local_lock_release(lock) \ + do { \ + local_trylock_t *tl; \ + local_lock_t *l; \ + \ + l = (local_lock_t *)this_cpu_ptr(lock); \ + tl = (local_trylock_t *)l; \ + local_lock_release(l); \ + _Generic((lock), \ + local_trylock_t *: ({ \ + lockdep_assert(tl->acquired == 1); \ + WRITE_ONCE(tl->acquired, 0); \ + }), \ + default:(void)0); \ + } while (0) + +#define __local_unlock(lock) \ do { \ - localtry_lock_t *lt; \ - lt = this_cpu_ptr(lock); \ - WRITE_ONCE(lt->acquired, 0); \ - local_lock_release(<->llock); \ + __local_lock_release(lock); \ preempt_enable(); \ } while (0) -#define __localtry_unlock_irq(lock) \ +#define __local_unlock_irq(lock) \ do { \ - localtry_lock_t *lt; \ - lt = this_cpu_ptr(lock); \ - WRITE_ONCE(lt->acquired, 0); \ - local_lock_release(<->llock); \ + __local_lock_release(lock); \ local_irq_enable(); \ } while (0) -#define __localtry_unlock_irqrestore(lock, flags) \ +#define __local_unlock_irqrestore(lock, flags) \ do { \ - localtry_lock_t *lt; \ - lt = this_cpu_ptr(lock); \ - WRITE_ONCE(lt->acquired, 0); \ - local_lock_release(<->llock); \ + __local_lock_release(lock); \ local_irq_restore(flags); \ } while (0) +#define __local_lock_nested_bh(lock) \ + do { \ + lockdep_assert_in_softirq(); \ + local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + +#define __local_unlock_nested_bh(lock) \ + local_lock_release(this_cpu_ptr(lock)) + #else /* !CONFIG_PREEMPT_RT */ /* @@ -237,16 +212,18 @@ do { \ * critical section while staying preemptible. */ typedef spinlock_t local_lock_t; -typedef spinlock_t localtry_lock_t; +typedef spinlock_t local_trylock_t; #define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) -#define INIT_LOCALTRY_LOCK(lockname) INIT_LOCAL_LOCK(lockname) +#define INIT_LOCAL_TRYLOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) #define __local_lock_init(l) \ do { \ local_spin_lock_init((l)); \ } while (0) +#define __local_trylock_init(l) __local_lock_init(l) + #define __local_lock(__lock) \ do { \ migrate_disable(); \ @@ -283,17 +260,7 @@ do { \ spin_unlock(this_cpu_ptr((lock))); \ } while (0) -/* localtry_lock_t variants */ - -#define __localtry_lock_init(lock) __local_lock_init(lock) -#define __localtry_lock(lock) __local_lock(lock) -#define __localtry_lock_irq(lock) __local_lock(lock) -#define __localtry_lock_irqsave(lock, flags) __local_lock_irqsave(lock, flags) -#define __localtry_unlock(lock) __local_unlock(lock) -#define __localtry_unlock_irq(lock) __local_unlock(lock) -#define __localtry_unlock_irqrestore(lock, flags) __local_unlock_irqrestore(lock, flags) - -#define __localtry_trylock(lock) \ +#define __local_trylock(lock) \ ({ \ int __locked; \ \ @@ -308,11 +275,11 @@ do { \ __locked; \ }) -#define __localtry_trylock_irqsave(lock, flags) \ +#define __local_trylock_irqsave(lock, flags) \ ({ \ typecheck(unsigned long, flags); \ flags = 0; \ - __localtry_trylock(lock); \ + __local_trylock(lock); \ }) #endif /* CONFIG_PREEMPT_RT */ diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index cbbcd18d41868..9ef1290382249 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -463,6 +463,8 @@ struct ma_wr_state { void __rcu **slots; /* mas->node->slots pointer */ void *entry; /* The entry to write */ void *content; /* The existing entry that is being overwritten */ + unsigned char vacant_height; /* Height of lowest node with free space */ + unsigned char sufficient_height;/* Height of lowest node with min sufficiency + 1 nodes */ }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) @@ -498,6 +500,8 @@ struct ma_wr_state { .mas = ma_state, \ .content = NULL, \ .entry = wr_entry, \ + .vacant_height = 0, \ + .sufficient_height = 0 \ } #define MA_TOPIARY(name, tree) \ diff --git a/include/linux/memblock.h b/include/linux/memblock.h index ef5a1ecc6e595..bb19a25342246 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -42,6 +42,14 @@ extern unsigned long long max_possible_pfn; * kernel resource tree. * @MEMBLOCK_RSRV_NOINIT: memory region for which struct pages are * not initialized (only for reserved regions). + * @MEMBLOCK_RSRV_KERN: memory region that is reserved for kernel use, + * either explictitly with memblock_reserve_kern() or via memblock + * allocation APIs. All memblock allocations set this flag. + * @MEMBLOCK_KHO_SCRATCH: memory region that kexec can pass to the next + * kernel in handover mode. During early boot, we do not know about all + * memory reservations yet, so we get scratch memory from the previous + * kernel that we know is good to use. It is the only memory that + * allocations may happen from in this phase. */ enum memblock_flags { MEMBLOCK_NONE = 0x0, /* No special request */ @@ -50,6 +58,8 @@ enum memblock_flags { MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */ MEMBLOCK_DRIVER_MANAGED = 0x8, /* always detected via a driver */ MEMBLOCK_RSRV_NOINIT = 0x10, /* don't initialize struct pages */ + MEMBLOCK_RSRV_KERN = 0x20, /* memory reserved for kernel use */ + MEMBLOCK_KHO_SCRATCH = 0x40, /* scratch memory for kexec handover */ }; /** @@ -116,7 +126,19 @@ int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid, int memblock_add(phys_addr_t base, phys_addr_t size); int memblock_remove(phys_addr_t base, phys_addr_t size); int memblock_phys_free(phys_addr_t base, phys_addr_t size); -int memblock_reserve(phys_addr_t base, phys_addr_t size); +int __memblock_reserve(phys_addr_t base, phys_addr_t size, int nid, + enum memblock_flags flags); + +static __always_inline int memblock_reserve(phys_addr_t base, phys_addr_t size) +{ + return __memblock_reserve(base, size, NUMA_NO_NODE, 0); +} + +static __always_inline int memblock_reserve_kern(phys_addr_t base, phys_addr_t size) +{ + return __memblock_reserve(base, size, NUMA_NO_NODE, MEMBLOCK_RSRV_KERN); +} + #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP int memblock_physmem_add(phys_addr_t base, phys_addr_t size); #endif @@ -132,6 +154,8 @@ int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size); +int memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size); +int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size); void memblock_free(void *ptr, size_t size); void reset_all_zones_managed_pages(void); @@ -275,6 +299,11 @@ static inline bool memblock_is_driver_managed(struct memblock_region *m) return m->flags & MEMBLOCK_DRIVER_MANAGED; } +static inline bool memblock_is_kho_scratch(struct memblock_region *m) +{ + return m->flags & MEMBLOCK_KHO_SCRATCH; +} + int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, @@ -476,6 +505,7 @@ static inline __init_memblock bool memblock_bottom_up(void) phys_addr_t memblock_phys_mem_size(void); phys_addr_t memblock_reserved_size(void); +phys_addr_t memblock_reserved_kern_size(phys_addr_t limit, int nid); unsigned long memblock_estimated_nr_free_pages(void); phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); @@ -602,5 +632,14 @@ static inline void early_memtest(phys_addr_t start, phys_addr_t end) { } static inline void memtest_report_meminfo(struct seq_file *m) { } #endif +#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH +void memblock_set_kho_scratch_only(void); +void memblock_clear_kho_scratch_only(void); +void memmap_init_kho_scratch_pages(void); +#else +static inline void memblock_set_kho_scratch_only(void) { } +static inline void memblock_clear_kho_scratch_only(void) { } +static inline void memmap_init_kho_scratch_pages(void) {} +#endif #endif /* _LINUX_MEMBLOCK_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 53364526d8775..5264d148bdd9d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1793,6 +1793,10 @@ static inline void count_objcg_events(struct obj_cgroup *objcg, { } +static inline ino_t page_cgroup_ino(struct page *page) +{ + return 0; +} #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) diff --git a/include/linux/memory.h b/include/linux/memory.h index 12daa6ec7d09f..5ec4e6d209b96 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -149,6 +149,14 @@ static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) { return 0; } +static inline int memory_block_advise_max_size(unsigned long size) +{ + return -ENODEV; +} +static inline unsigned long memory_block_advised_max_size(void) +{ + return 0; +} #else /* CONFIG_MEMORY_HOTPLUG */ extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); @@ -181,6 +189,8 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, void memory_block_add_nid(struct memory_block *mem, int nid, enum meminit_context context); #endif /* CONFIG_NUMA */ +int memory_block_advise_max_size(unsigned long size); +unsigned long memory_block_advised_max_size(void); #endif /* CONFIG_MEMORY_HOTPLUG */ /* diff --git a/include/linux/mm.h b/include/linux/mm.h index b7f13f087954b..5eb0d77c44389 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -1218,6 +1219,23 @@ static inline unsigned int folio_order(const struct folio *folio) return folio_large_order(folio); } +/** + * folio_reset_order - Reset the folio order and derived _nr_pages + * @folio: The folio. + * + * Reset the order and derived _nr_pages to 0. Must only be used in the + * process of splitting large folios. + */ +static inline void folio_reset_order(struct folio *folio) +{ + if (WARN_ON_ONCE(!folio_test_large(folio))) + return; + folio->_flags_1 &= ~0xffUL; +#ifdef NR_PAGES_IN_LARGE_FOLIO + folio->_nr_pages = 0; +#endif +} + #include /* @@ -1987,6 +2005,45 @@ static inline struct folio *pfn_folio(unsigned long pfn) return page_folio(pfn_to_page(pfn)); } +#ifdef CONFIG_MMU +static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) +{ + return pfn_pte(page_to_pfn(page), pgprot); +} + +/** + * folio_mk_pte - Create a PTE for this folio + * @folio: The folio to create a PTE for + * @pgprot: The page protection bits to use + * + * Create a page table entry for the first page of this folio. + * This is suitable for passing to set_ptes(). + * + * Return: A page table entry suitable for mapping this folio. + */ +static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot) +{ + return pfn_pte(folio_pfn(folio), pgprot); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/** + * folio_mk_pmd - Create a PMD for this folio + * @folio: The folio to create a PMD for + * @pgprot: The page protection bits to use + * + * Create a page table entry for the first page of this folio. + * This is suitable for passing to set_pmd_at(). + * + * Return: A page table entry suitable for mapping this folio. + */ +static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot) +{ + return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot)); +} +#endif +#endif /* CONFIG_MMU */ + static inline bool folio_has_pincount(const struct folio *folio) { if (IS_ENABLED(CONFIG_64BIT)) @@ -2167,15 +2224,6 @@ static inline long compound_nr(struct page *page) return folio_large_nr_pages(folio); } -/** - * thp_nr_pages - The number of regular pages in this huge page. - * @page: The head page of a huge page. - */ -static inline long thp_nr_pages(struct page *page) -{ - return folio_nr_pages((struct folio *)page); -} - /** * folio_next - Move to the next physical folio. * @folio: The folio we're currently operating on. @@ -2389,7 +2437,6 @@ static inline void clear_page_pfmemalloc(struct page *page) extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) -#define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) #define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) /* @@ -2750,7 +2797,7 @@ static inline void update_hiwater_rss(struct mm_struct *mm) { unsigned long _rss = get_mm_rss(mm); - if ((mm)->hiwater_rss < _rss) + if (data_race(mm->hiwater_rss) < _rss) (mm)->hiwater_rss = _rss; } @@ -3100,9 +3147,10 @@ static inline void pagetable_dtor_free(struct ptdesc *ptdesc) pagetable_free(ptdesc); } -static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) +static inline bool pagetable_pte_ctor(struct mm_struct *mm, + struct ptdesc *ptdesc) { - if (!ptlock_init(ptdesc)) + if (mm != &init_mm && !ptlock_init(ptdesc)) return false; __pagetable_ctor(ptdesc); return true; @@ -3206,9 +3254,10 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) return ptl; } -static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) +static inline bool pagetable_pmd_ctor(struct mm_struct *mm, + struct ptdesc *ptdesc) { - if (!pmd_ptlock_init(ptdesc)) + if (mm != &init_mm && !pmd_ptlock_init(ptdesc)) return false; ptdesc_pmd_pts_init(ptdesc); __pagetable_ctor(ptdesc); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 25e80b2ca7f41..6ccec1bf2896f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -967,6 +967,9 @@ struct zone { #ifdef CONFIG_UNACCEPTED_MEMORY /* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */ struct list_head unaccepted_pages; + + /* To be called once the last page in the zone is accepted */ + struct work_struct unaccepted_cleanup; #endif /* zone flags, see below */ @@ -1499,8 +1502,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); -bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int highest_zoneidx); /* * Memory initialization context, use to differentiate memory added by * the platform statically or via memory hotplug interface. diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h index dd85613cdd86a..991076cba7c50 100644 --- a/include/linux/numa_memblks.h +++ b/include/linux/numa_memblks.h @@ -22,6 +22,7 @@ struct numa_meminfo { }; int __init numa_add_memblk(int nodeid, u64 start, u64 end); +int __init numa_add_reserved_memblk(int nid, u64 start, u64 end); void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); int __init numa_cleanup_meminfo(struct numa_meminfo *mi); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e6a21b62dccee..d3909cb1e5766 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -982,7 +982,7 @@ static inline bool page_mapcount_is_type(unsigned int mapcount) static inline bool page_has_type(const struct page *page) { - return page_mapcount_is_type(data_race(page->page_type)); + return page_type_has_type(data_race(page->page_type)); } #define FOLIO_TYPE_OPS(lname, fname) \ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 26baa78f1ca7d..af25fb640463d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -945,19 +945,6 @@ static inline bool folio_contains(struct folio *folio, pgoff_t index) return index - folio_index(folio) < folio_nr_pages(folio); } -/* - * Given the page we found in the page cache, return the page corresponding - * to this index in the file - */ -static inline struct page *find_subpage(struct page *head, pgoff_t index) -{ - /* HugeTLBfs wants the head page regardless */ - if (PageHuge(head)) - return head; - - return head + (index & (thp_nr_pages(head) - 1)); -} - unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_contig(struct address_space *mapping, @@ -1308,9 +1295,9 @@ static inline bool filemap_range_needs_writeback(struct address_space *mapping, * struct readahead_control - Describes a readahead request. * * A readahead request is for consecutive pages. Filesystems which - * implement the ->readahead method should call readahead_page() or - * readahead_page_batch() in a loop and attempt to start I/O against - * each page in the request. + * implement the ->readahead method should call readahead_folio() or + * __readahead_batch() in a loop and attempt to start reads into each + * folio in the request. * * Most of the fields in this struct are private and should be accessed * by the functions below. @@ -1415,22 +1402,6 @@ static inline struct folio *__readahead_folio(struct readahead_control *ractl) return folio; } -/** - * readahead_page - Get the next page to read. - * @ractl: The current readahead request. - * - * Context: The page is locked and has an elevated refcount. The caller - * should decreases the refcount once the page has been submitted for I/O - * and unlock the page once all I/O to that page has completed. - * Return: A pointer to the next page, or %NULL if we are done. - */ -static inline struct page *readahead_page(struct readahead_control *ractl) -{ - struct folio *folio = __readahead_folio(ractl); - - return &folio->page; -} - /** * readahead_folio - Get the next folio to read. * @ractl: The current readahead request. @@ -1453,7 +1424,7 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, { unsigned int i = 0; XA_STATE(xas, &rac->mapping->i_pages, 0); - struct page *page; + struct folio *folio; BUG_ON(rac->_batch_count > rac->_nr_pages); rac->_nr_pages -= rac->_batch_count; @@ -1462,13 +1433,12 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, xas_set(&xas, rac->_index); rcu_read_lock(); - xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, rac->_index + rac->_nr_pages - 1) { + if (xas_retry(&xas, folio)) continue; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageTail(page), page); - array[i++] = page; - rac->_batch_count += thp_nr_pages(page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + array[i++] = folio_page(folio, 0); + rac->_batch_count += folio_nr_pages(folio); if (i == array_sz) break; } @@ -1477,20 +1447,6 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, return i; } -/** - * readahead_page_batch - Get a batch of pages to read. - * @rac: The current readahead request. - * @array: An array of pointers to struct page. - * - * Context: The pages are locked and have an elevated refcount. The caller - * should decreases the refcount once the page has been submitted for I/O - * and unlock the page once all I/O to that page has completed. - * Return: The number of pages placed in the array. 0 indicates the request - * is complete. - */ -#define readahead_page_batch(rac, array) \ - __readahead_batch(rac, array, ARRAY_SIZE(array)) - /** * readahead_pos - The byte offset into the file of this readahead request. * @rac: The readahead request. diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0069ba6866a48..947ad12dfdbe4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -144,7 +144,7 @@ struct hw_perf_event_extra { * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ -#define PERF_EVENT_FLAG_ARCH 0x000fffff +#define PERF_EVENT_FLAG_ARCH 0x0fffffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); @@ -157,7 +157,9 @@ struct hw_perf_event { union { struct { /* hardware */ u64 config; + u64 config1; u64 last_tag; + u64 dyn_constraint; unsigned long config_base; unsigned long event_base; int event_base_rdpmc; @@ -325,6 +327,9 @@ struct perf_output_handle; struct pmu { struct list_head entry; + spinlock_t events_lock; + struct list_head events; + struct module *module; struct device *dev; struct device *parent; @@ -622,9 +627,10 @@ struct perf_addr_filter_range { * enum perf_event_state - the states of an event: */ enum perf_event_state { - PERF_EVENT_STATE_DEAD = -4, - PERF_EVENT_STATE_EXIT = -3, - PERF_EVENT_STATE_ERROR = -2, + PERF_EVENT_STATE_DEAD = -5, + PERF_EVENT_STATE_REVOKED = -4, /* pmu gone, must not touch */ + PERF_EVENT_STATE_EXIT = -3, /* task died, still inherit */ + PERF_EVENT_STATE_ERROR = -2, /* scheduling error, can enable */ PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, PERF_EVENT_STATE_ACTIVE = 1, @@ -865,6 +871,7 @@ struct perf_event { void *security; #endif struct list_head sb_list; + struct list_head pmu_list; /* * Certain events gets forwarded to another pmu internally by over- @@ -1155,7 +1162,7 @@ extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); extern void perf_event_itrace_started(struct perf_event *event); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); -extern void perf_pmu_unregister(struct pmu *pmu); +extern int perf_pmu_unregister(struct pmu *pmu); extern void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task); @@ -1760,7 +1767,7 @@ static inline bool needs_branch_stack(struct perf_event *event) static inline bool has_aux(struct perf_event *event) { - return event->pmu->setup_aux; + return event->pmu && event->pmu->setup_aux; } static inline bool has_aux_action(struct perf_event *event) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e2b705c149454..b50447ef1c921 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1511,8 +1511,9 @@ static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, /* * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page - * tables copied during copy_page_range(). On success, stores the pfn to be - * passed to untrack_pfn_copy(). + * tables copied during copy_page_range(). Will store the pfn to be + * passed to untrack_pfn_copy() only if there is something to be untracked. + * Callers should initialize the pfn to 0. */ static inline int track_pfn_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long *pfn) @@ -1522,7 +1523,9 @@ static inline int track_pfn_copy(struct vm_area_struct *dst_vma, /* * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during - * copy_page_range(), but after track_pfn_copy() was already called. + * copy_page_range(), but after track_pfn_copy() was already called. Can + * be called even if track_pfn_copy() did not actually track anything: + * handled internally. */ static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn) diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index 8dbd51ea86267..240bd3bff18dd 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -11,10 +11,17 @@ struct ptdump_range { }; struct ptdump_state { - /* level is 0:PGD to 4:PTE, or -1 if unknown */ - void (*note_page)(struct ptdump_state *st, unsigned long addr, - int level, u64 val); - void (*effective_prot)(struct ptdump_state *st, int level, u64 val); + void (*note_page_pte)(struct ptdump_state *st, unsigned long addr, pte_t pte); + void (*note_page_pmd)(struct ptdump_state *st, unsigned long addr, pmd_t pmd); + void (*note_page_pud)(struct ptdump_state *st, unsigned long addr, pud_t pud); + void (*note_page_p4d)(struct ptdump_state *st, unsigned long addr, p4d_t p4d); + void (*note_page_pgd)(struct ptdump_state *st, unsigned long addr, pgd_t pgd); + void (*note_page_flush)(struct ptdump_state *st); + void (*effective_prot_pte)(struct ptdump_state *st, pte_t pte); + void (*effective_prot_pmd)(struct ptdump_state *st, pmd_t pmd); + void (*effective_prot_pud)(struct ptdump_state *st, pud_t pud); + void (*effective_prot_p4d)(struct ptdump_state *st, p4d_t p4d); + void (*effective_prot_pgd)(struct ptdump_state *st, pgd_t pgd); const struct ptdump_range *range; }; diff --git a/include/linux/sched.h b/include/linux/sched.h index f96ac19828934..1c50e30b5c01d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -549,6 +549,10 @@ struct sched_statistics { u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; +#ifdef CONFIG_NUMA_BALANCING + u64 numa_task_migrated; + u64 numa_task_swapped; +#endif u64 nr_wakeups; u64 nr_wakeups_sync; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e5603cc91963d..b0cc60f1c458a 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -998,6 +998,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on); asmlinkage long sys_uretprobe(void); +asmlinkage long sys_uprobe(void); + /* pciconfig: alpha, arm, arm64, ia64, sparc */ asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len, diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 2e46b69ff0a6f..d4c1fed9a9e4e 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -17,6 +17,7 @@ #include #include #include +#include struct uprobe; struct vm_area_struct; @@ -185,16 +186,26 @@ struct xol_area; struct uprobes_state { struct xol_area *xol_area; +#ifdef CONFIG_X86_64 + struct hlist_head head_tramps; +#endif }; +typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, + uprobe_opcode_t *insn, int nbytes, void *data); + extern void __init uprobes_init(void); -extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); -extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); +extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); +extern int set_orig_insn(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); extern bool is_swbp_insn(uprobe_opcode_t *insn); extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); -extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); +extern int uprobe_write_opcode(struct vm_area_struct *vma, unsigned long vaddr, + uprobe_opcode_t opcode, bool is_register); +extern int uprobe_write(struct vm_area_struct *vma, const unsigned long insn_vaddr, + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, + void *data); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); @@ -224,8 +235,13 @@ extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); extern void uprobe_handle_trampoline(struct pt_regs *regs); -extern void *arch_uprobe_trampoline(unsigned long *psize); +extern void *arch_uretprobe_trampoline(unsigned long *psize); extern unsigned long uprobe_get_trampoline_vaddr(void); +extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len); +extern void arch_uprobe_clear_state(struct mm_struct *mm); +extern void arch_uprobe_init_state(struct mm_struct *mm); +extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr); +extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 9e15a088ba38e..91a3ce9a2687e 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -66,6 +66,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, NUMA_HINT_FAULTS, NUMA_HINT_FAULTS_LOCAL, NUMA_PAGE_MIGRATE, + NUMA_TASK_MIGRATE, + NUMA_TASK_SWAP, #endif #ifdef CONFIG_MIGRATION PGMIGRATE_SUCCESS, PGMIGRATE_FAIL, diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 78eede109b1a5..be850174e802e 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -965,10 +965,12 @@ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id, * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * + * Note that callers interested in whether wrapping has occurred should + * use __xa_alloc_cyclic() instead. + * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. - * Return: 0 if the allocation succeeded without wrapping. 1 if the - * allocation succeeded after wrapping, -ENOMEM if memory could not be + * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, @@ -981,7 +983,7 @@ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock(xa); - return err; + return err < 0 ? err : 0; } /** @@ -1002,10 +1004,12 @@ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * + * Note that callers interested in whether wrapping has occurred should + * use __xa_alloc_cyclic() instead. + * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. - * Return: 0 if the allocation succeeded without wrapping. 1 if the - * allocation succeeded after wrapping, -ENOMEM if memory could not be + * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, @@ -1018,7 +1022,7 @@ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_bh(xa); - return err; + return err < 0 ? err : 0; } /** @@ -1039,10 +1043,12 @@ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * + * Note that callers interested in whether wrapping has occurred should + * use __xa_alloc_cyclic() instead. + * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. - * Return: 0 if the allocation succeeded without wrapping. 1 if the - * allocation succeeded after wrapping, -ENOMEM if memory could not be + * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, @@ -1055,7 +1061,7 @@ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_irq(xa); - return err; + return err < 0 ? err : 0; } /** diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 52f30e526607f..369ef068fad8e 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -22,7 +22,7 @@ const char *zpool_get_type(struct zpool *pool); void zpool_destroy_pool(struct zpool *pool); int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, - unsigned long *handle); + unsigned long *handle, const int nid); void zpool_free(struct zpool *pool, unsigned long handle); @@ -64,7 +64,7 @@ struct zpool_driver { void (*destroy)(void *pool); int (*malloc)(void *pool, size_t size, gfp_t gfp, - unsigned long *handle); + unsigned long *handle, const int nid); void (*free)(void *pool, unsigned long handle); void *(*obj_read_begin)(void *pool, unsigned long handle, diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index c26baf9fb331b..13e9cc5490f71 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -26,7 +26,8 @@ struct zs_pool; struct zs_pool *zs_create_pool(const char *name); void zs_destroy_pool(struct zs_pool *pool); -unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags); +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags, + const int nid); void zs_free(struct zs_pool *pool, unsigned long obj); size_t zs_huge_class_size(struct zs_pool *pool); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 28705ae677849..71d5ac83cf5da 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1506,7 +1506,7 @@ union bpf_attr { __s32 map_token_fd; }; - struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ + struct { /* anonymous struct used by BPF_MAP_*_ELEM and BPF_MAP_FREEZE commands */ __u32 map_fd; __aligned_u64 key; union { @@ -1995,11 +1995,15 @@ union bpf_attr { * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) * Description * Store *len* bytes from address *from* into the packet - * associated to *skb*, at *offset*. *flags* are a combination of - * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the - * checksum for the packet after storing the bytes) and - * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ - * **->swhash** and *skb*\ **->l4hash** to 0). + * associated to *skb*, at *offset*. The *flags* are a combination + * of the following values: + * + * **BPF_F_RECOMPUTE_CSUM** + * Automatically update *skb*\ **->csum** after storing the + * bytes. + * **BPF_F_INVALIDATE_HASH** + * Set *skb*\ **->hash**, *skb*\ **->swhash** and *skb*\ + * **->l4hash** to 0. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers @@ -2051,7 +2055,7 @@ union bpf_attr { * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and * for updates resulting in a null checksum the value is set to * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates - * the checksum is to be computed against a pseudo-header. + * that the modified header field is part of the pseudo-header. * * This helper works in combination with **bpf_csum_diff**\ (), * which does not update the checksum in-place, but offers more diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index e762e1af650c4..0098b0ce8ccb1 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -361,6 +361,7 @@ typedef int __bitwise __kernel_rwf_t; #define PAGE_IS_PFNZERO (1 << 5) #define PAGE_IS_HUGE (1 << 6) #define PAGE_IS_SOFT_DIRTY (1 << 7) +#define PAGE_IS_GUARD (1 << 8) /* * struct page_region - Page region with flags diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h index 72c038fc71d09..5f8ef61567523 100644 --- a/include/uapi/linux/ptrace.h +++ b/include/uapi/linux/ptrace.h @@ -74,6 +74,7 @@ struct seccomp_metadata { }; #define PTRACE_GET_SYSCALL_INFO 0x420e +#define PTRACE_SET_SYSCALL_INFO 0x4212 #define PTRACE_SYSCALL_INFO_NONE 0 #define PTRACE_SYSCALL_INFO_ENTRY 1 #define PTRACE_SYSCALL_INFO_EXIT 2 @@ -81,7 +82,8 @@ struct seccomp_metadata { struct ptrace_syscall_info { __u8 op; /* PTRACE_SYSCALL_INFO_* */ - __u8 pad[3]; + __u8 reserved; + __u16 flags; __u32 arch; __u64 instruction_pointer; __u64 stack_pointer; @@ -98,6 +100,7 @@ struct ptrace_syscall_info { __u64 nr; __u64 args[6]; __u32 ret_data; + __u32 reserved2; } seccomp; }; }; @@ -142,6 +145,8 @@ struct ptrace_sud_config { __u64 len; }; +/* 0x4212 is PTRACE_SET_SYSCALL_INFO */ + /* * These values are stored in task->ptrace_message * by ptrace_stop to describe the current syscall-stop. diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 4d111f8719516..4fa212909d699 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -95,6 +95,20 @@ config KEXEC_JUMP Jump between original kernel and kexeced kernel and invoke code in physical address mode via KEXEC +config KEXEC_HANDOVER + bool "kexec handover" + depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE + select MEMBLOCK_KHO_SCRATCH + select KEXEC_FILE + select DEBUG_FS + select LIBFDT + select CMA + help + Allow kexec to hand over state across kernels by generating and + passing additional metadata to the target kernel. This is useful + to keep data or state alive across the kexec. For this to work, + both source and target kernels need to have this option enabled. + config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP diff --git a/kernel/Makefile b/kernel/Makefile index 434929de17ef2..97c09847db42f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -80,6 +80,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o +obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 5a5adc66b8e22..2e18d7e50d9b6 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -175,20 +175,30 @@ static bool htab_is_percpu(const struct bpf_htab *htab) htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } +static inline bool is_fd_htab(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS; +} + +static inline void *htab_elem_value(struct htab_elem *l, u32 key_size) +{ + return l->key + round_up(key_size, 8); +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { - *(void __percpu **)(l->key + roundup(key_size, 8)) = pptr; + *(void __percpu **)htab_elem_value(l, key_size) = pptr; } static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) { - return *(void __percpu **)(l->key + roundup(key_size, 8)); + return *(void __percpu **)htab_elem_value(l, key_size); } static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) { - return *(void **)(l->key + roundup(map->key_size, 8)); + return *(void **)htab_elem_value(l, map->key_size); } static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) @@ -196,9 +206,13 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); } +/* Both percpu and fd htab support in-place update, so no need for + * extra elem. LRU itself can remove the least used element, so + * there is no need for an extra elem during map_update. + */ static bool htab_has_extra_elems(struct bpf_htab *htab) { - return !htab_is_percpu(htab) && !htab_is_lru(htab); + return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab); } static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) @@ -215,10 +229,10 @@ static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) elem = get_htab_elem(htab, i); if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); + htab_elem_value(elem, htab->map.key_size)); if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); + htab_elem_value(elem, htab->map.key_size)); cond_resched(); } } @@ -245,7 +259,8 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab) cond_resched(); } } else { - bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); + bpf_obj_free_fields(htab->map.record, + htab_elem_value(elem, htab->map.key_size)); cond_resched(); } cond_resched(); @@ -453,8 +468,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); - bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || - attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); /* percpu_lru means each cpu has its own LRU list. * it is different from BPF_MAP_TYPE_PERCPU_HASH where * the map's value itself is percpu. percpu_lru has @@ -549,10 +562,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (err) goto free_map_locked; - if (!percpu && !lru) { - /* lru itself can remove the least used element, so - * there is no need for an extra elem during map_update. - */ + if (htab_has_extra_elems(htab)) { err = alloc_extra_elems(htab); if (err) goto free_prealloc; @@ -670,7 +680,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) - return l->key + round_up(map->key_size, 8); + return htab_elem_value(l, map->key_size); return NULL; } @@ -709,7 +719,7 @@ static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, if (l) { if (mark) bpf_lru_node_set_ref(&l->lru_node); - return l->key + round_up(map->key_size, 8); + return htab_elem_value(l, map->key_size); } return NULL; @@ -763,7 +773,7 @@ static void check_and_free_fields(struct bpf_htab *htab, for_each_possible_cpu(cpu) bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); } else { - void *map_value = elem->key + round_up(htab->map.key_size, 8); + void *map_value = htab_elem_value(elem, htab->map.key_size); bpf_obj_free_fields(htab->map.record, map_value); } @@ -968,8 +978,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) { - return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && - BITS_PER_LONG == 64; + return is_fd_htab(htab) && BITS_PER_LONG == 64; } static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, @@ -1039,11 +1048,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, htab_elem_set_ptr(l_new, key_size, pptr); } else if (fd_htab_map_needs_adjust(htab)) { size = round_up(size, 8); - memcpy(l_new->key + round_up(key_size, 8), value, size); + memcpy(htab_elem_value(l_new, key_size), value, size); } else { - copy_map_value(&htab->map, - l_new->key + round_up(key_size, 8), - value); + copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value); } l_new->hash = hash; @@ -1072,10 +1079,9 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new = NULL, *l_old; + struct htab_elem *l_new, *l_old; struct hlist_nulls_head *head; unsigned long flags; - void *old_map_ptr; struct bucket *b; u32 key_size, hash; int ret; @@ -1106,7 +1112,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (l_old) { /* grab the element lock and update value in place */ copy_map_value_locked(map, - l_old->key + round_up(key_size, 8), + htab_elem_value(l_old, key_size), value, false); return 0; } @@ -1134,7 +1140,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, * and update element in place */ copy_map_value_locked(map, - l_old->key + round_up(key_size, 8), + htab_elem_value(l_old, key_size), value, false); ret = 0; goto err; @@ -1156,24 +1162,14 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_nulls_del_rcu(&l_old->hash_node); /* l_old has already been stashed in htab->extra_elems, free - * its special fields before it is available for reuse. Also - * save the old map pointer in htab of maps before unlock - * and release it after unlock. + * its special fields before it is available for reuse. */ - old_map_ptr = NULL; - if (htab_is_prealloc(htab)) { - if (map->ops->map_fd_put_ptr) - old_map_ptr = fd_htab_map_get_ptr(map, l_old); + if (htab_is_prealloc(htab)) check_and_free_fields(htab, l_old); - } } htab_unlock_bucket(b, flags); - if (l_old) { - if (old_map_ptr) - map->ops->map_fd_put_ptr(map, old_map_ptr, true); - if (!htab_is_prealloc(htab)) - free_htab_elem(htab, l_old); - } + if (l_old && !htab_is_prealloc(htab)) + free_htab_elem(htab, l_old); return 0; err: htab_unlock_bucket(b, flags); @@ -1220,8 +1216,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value l_new = prealloc_lru_pop(htab, key, hash); if (!l_new) return -ENOMEM; - copy_map_value(&htab->map, - l_new->key + round_up(map->key_size, 8), value); + copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value); ret = htab_lock_bucket(b, &flags); if (ret) @@ -1255,13 +1250,14 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value return ret; } -static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, +static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, void *value, u64 map_flags, - bool onallcpus) + bool percpu, bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new = NULL, *l_old; + struct htab_elem *l_new, *l_old; struct hlist_nulls_head *head; + void *old_map_ptr = NULL; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -1292,21 +1288,29 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, goto err; if (l_old) { - /* per-cpu hash map can update value in-place */ - pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), - value, onallcpus); + /* Update value in-place */ + if (percpu) { + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); + } else { + void **inner_map_pptr = htab_elem_value(l_old, key_size); + + old_map_ptr = *inner_map_pptr; + WRITE_ONCE(*inner_map_pptr, *(void **)value); + } } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, true, onallcpus, NULL); + hash, percpu, onallcpus, NULL); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; } hlist_nulls_add_head_rcu(&l_new->hash_node, head); } - ret = 0; err: htab_unlock_bucket(b, flags); + if (old_map_ptr) + map->ops->map_fd_put_ptr(map, old_map_ptr, true); return ret; } @@ -1383,7 +1387,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, static long htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { - return __htab_percpu_map_update_elem(map, key, value, map_flags, false); + return htab_map_update_elem_in_place(map, key, value, map_flags, true, false); } static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, @@ -1500,10 +1504,10 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) /* We only free timer on uref dropping to zero */ if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, - l->key + round_up(htab->map.key_size, 8)); + htab_elem_value(l, htab->map.key_size)); if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, - l->key + round_up(htab->map.key_size, 8)); + htab_elem_value(l, htab->map.key_size)); } cond_resched_rcu(); } @@ -1615,15 +1619,12 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, off += roundup_value_size; } } else { - u32 roundup_key_size = round_up(map->key_size, 8); + void *src = htab_elem_value(l, map->key_size); if (flags & BPF_F_LOCK) - copy_map_value_locked(map, value, l->key + - roundup_key_size, - true); + copy_map_value_locked(map, value, src, true); else - copy_map_value(map, value, l->key + - roundup_key_size); + copy_map_value(map, value, src); /* Zeroing special fields in the temp buffer */ check_and_init_map_value(map, value); } @@ -1680,12 +1681,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, bool is_percpu) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - u32 bucket_cnt, total, key_size, value_size, roundup_key_size; void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; void __user *uvalues = u64_to_user_ptr(attr->batch.values); void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size, map_id; + u32 bucket_cnt, total, key_size, value_size; struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; @@ -1720,7 +1721,6 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, return -ENOENT; key_size = htab->map.key_size; - roundup_key_size = round_up(htab->map.key_size, 8); value_size = htab->map.value_size; size = round_up(value_size, 8); if (is_percpu) @@ -1812,8 +1812,8 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, off += size; } } else { - value = l->key + roundup_key_size; - if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + value = htab_elem_value(l, key_size); + if (is_fd_htab(htab)) { struct bpf_map **inner_map = value; /* Actual value is the id of the inner map */ @@ -2063,11 +2063,11 @@ static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) { struct bpf_iter_seq_hash_map_info *info = seq->private; - u32 roundup_key_size, roundup_value_size; struct bpf_iter__bpf_map_elem ctx = {}; struct bpf_map *map = info->map; struct bpf_iter_meta meta; int ret = 0, off = 0, cpu; + u32 roundup_value_size; struct bpf_prog *prog; void __percpu *pptr; @@ -2077,10 +2077,9 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) ctx.meta = &meta; ctx.map = info->map; if (elem) { - roundup_key_size = round_up(map->key_size, 8); ctx.key = elem->key; if (!info->percpu_value_buf) { - ctx.value = elem->key + roundup_key_size; + ctx.value = htab_elem_value(elem, map->key_size); } else { roundup_value_size = round_up(map->value_size, 8); pptr = htab_elem_get_ptr(elem, map->key_size); @@ -2165,7 +2164,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ struct hlist_nulls_head *head; struct hlist_nulls_node *n; struct htab_elem *elem; - u32 roundup_key_size; int i, num_elems = 0; void __percpu *pptr; struct bucket *b; @@ -2180,7 +2178,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ is_percpu = htab_is_percpu(htab); - roundup_key_size = round_up(map->key_size, 8); /* migration has been disabled, so percpu value prepared here will be * the same as the one seen by the bpf program with * bpf_map_lookup_elem(). @@ -2196,7 +2193,7 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ pptr = htab_elem_get_ptr(elem, map->key_size); val = this_cpu_ptr(pptr); } else { - val = elem->key + roundup_key_size; + val = htab_elem_value(elem, map->key_size); } num_elems++; ret = callback_fn((u64)(long)map, (u64)(long)key, @@ -2411,8 +2408,8 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, ret = __htab_lru_percpu_map_update_elem(map, key, value, map_flags, true); else - ret = __htab_percpu_map_update_elem(map, key, value, map_flags, - true); + ret = htab_map_update_elem_in_place(map, key, value, map_flags, + true, true); rcu_read_unlock(); return ret; @@ -2536,24 +2533,23 @@ int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) return ret; } -/* only called from syscall */ +/* Only called from syscall */ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) { void *ptr; int ret; - u32 ufd = *(u32 *)value; - ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); + ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value); if (IS_ERR(ptr)) return PTR_ERR(ptr); /* The htab bucket lock is always held during update operations in fd * htab map, and the following rcu_read_lock() is only used to avoid - * the WARN_ON_ONCE in htab_map_update_elem(). + * the WARN_ON_ONCE in htab_map_update_elem_in_place(). */ rcu_read_lock(); - ret = htab_map_update_elem(map, key, &ptr, map_flags); + ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false); rcu_read_unlock(); if (ret) map->ops->map_fd_put_ptr(map, ptr, false); diff --git a/kernel/events/core.c b/kernel/events/core.c index e93c195659140..07414cb1279bd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -208,6 +208,7 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, } #define TASK_TOMBSTONE ((void *)-1L) +#define EVENT_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) { @@ -1270,6 +1271,10 @@ static void put_ctx(struct perf_event_context *ctx) if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); + } else { + smp_mb__after_atomic(); /* pairs with wait_var_event() */ + if (ctx->task == TASK_TOMBSTONE) + wake_up_var(&ctx->refcount); } } @@ -2325,10 +2330,20 @@ static void perf_child_detach(struct perf_event *event) if (WARN_ON_ONCE(!parent_event)) return; + /* + * Can't check this from an IPI, the holder is likey another CPU. + * lockdep_assert_held(&parent_event->child_mutex); + */ sync_child_event(event); list_del_init(&event->child_list); + /* + * Cannot set to NULL, as that would confuse the situation vs + * not being a child event. See for example unaccount_event(). + */ + event->parent = EVENT_TOMBSTONE; + put_event(parent_event); } static bool is_orphaned_event(struct perf_event *event) @@ -2450,8 +2465,9 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL -#define DETACH_DEAD 0x04UL -#define DETACH_EXIT 0x08UL +#define DETACH_EXIT 0x04UL +#define DETACH_REVOKE 0x08UL +#define DETACH_DEAD 0x10UL /* * Cross CPU call to remove a performance event @@ -2477,12 +2493,15 @@ __perf_remove_from_context(struct perf_event *event, */ if (flags & DETACH_EXIT) state = PERF_EVENT_STATE_EXIT; + if (flags & DETACH_REVOKE) + state = PERF_EVENT_STATE_REVOKED; if (flags & DETACH_DEAD) { event->pending_disable = 1; state = PERF_EVENT_STATE_DEAD; } event_sched_out(event, ctx); perf_event_set_state(event, min(event->state, state)); + if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) @@ -4516,7 +4535,8 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) static void perf_remove_from_owner(struct perf_event *event); static void perf_event_exit_event(struct perf_event *event, - struct perf_event_context *ctx); + struct perf_event_context *ctx, + bool revoke); /* * Removes all events from the current task that have been marked @@ -4543,7 +4563,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx) modified = true; - perf_event_exit_event(event, ctx); + perf_event_exit_event(event, ctx, false); } raw_spin_lock_irqsave(&ctx->lock, flags); @@ -5125,6 +5145,7 @@ static bool is_sb_event(struct perf_event *event) attr->context_switch || attr->text_poke || attr->bpf_event) return true; + return false; } @@ -5521,6 +5542,8 @@ static void perf_free_addr_filters(struct perf_event *event); /* vs perf_event_alloc() error */ static void __free_event(struct perf_event *event) { + struct pmu *pmu = event->pmu; + if (event->attach_state & PERF_ATTACH_CALLCHAIN) put_callchain_buffers(); @@ -5550,6 +5573,7 @@ static void __free_event(struct perf_event *event) * put_pmu_ctx() needs an event->ctx reference, because of * epc->ctx. */ + WARN_ON_ONCE(!pmu); WARN_ON_ONCE(!event->ctx); WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx); put_pmu_ctx(event->pmu_ctx); @@ -5562,8 +5586,13 @@ static void __free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); - if (event->pmu) - module_put(event->pmu->module); + if (pmu) { + module_put(pmu->module); + scoped_guard (spinlock, &pmu->events_lock) { + list_del(&event->pmu_list); + wake_up_var(pmu); + } + } call_rcu(&event->rcu_head, free_event_rcu); } @@ -5598,22 +5627,6 @@ static void _free_event(struct perf_event *event) __free_event(event); } -/* - * Used to free events which have a known refcount of 1, such as in error paths - * where the event isn't exposed yet and inherited events. - */ -static void free_event(struct perf_event *event) -{ - if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, - "unexpected event refcount: %ld; ptr=%p\n", - atomic_long_read(&event->refcount), event)) { - /* leak to avoid use-after-free */ - return; - } - - _free_event(event); -} - /* * Remove user event from the owner task. */ @@ -5676,7 +5689,7 @@ static void put_event(struct perf_event *event) _free_event(event); /* Matches the refcount bump in inherit_event() */ - if (parent) + if (parent && parent != EVENT_TOMBSTONE) put_event(parent); } @@ -5689,7 +5702,6 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; - LIST_HEAD(free_list); /* * If we got here through err_alloc: free_event(event); we will not @@ -5718,15 +5730,17 @@ int perf_event_release_kernel(struct perf_event *event) * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. */ - perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); + if (event->state > PERF_EVENT_STATE_REVOKED) { + perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); + } else { + event->state = PERF_EVENT_STATE_DEAD; + } perf_event_ctx_unlock(event, ctx); again: mutex_lock(&event->child_mutex); list_for_each_entry(child, &event->child_list, child_list) { - void *var = NULL; - /* * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). @@ -5759,44 +5773,24 @@ int perf_event_release_kernel(struct perf_event *event) tmp = list_first_entry_or_null(&event->child_list, struct perf_event, child_list); if (tmp == child) { - perf_remove_from_context(child, DETACH_GROUP); - list_move(&child->child_list, &free_list); + perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD); } else { - var = &ctx->refcount; + child = NULL; } mutex_unlock(&event->child_mutex); mutex_unlock(&ctx->mutex); - put_ctx(ctx); - if (var) { - /* - * If perf_event_free_task() has deleted all events from the - * ctx while the child_mutex got released above, make sure to - * notify about the preceding put_ctx(). - */ - smp_mb(); /* pairs with wait_var_event() */ - wake_up_var(var); + if (child) { + /* Last reference unless ->pending_task work is pending */ + put_event(child); } + put_ctx(ctx); + goto again; } mutex_unlock(&event->child_mutex); - list_for_each_entry_safe(child, tmp, &free_list, child_list) { - void *var = &child->ctx->refcount; - - list_del(&child->child_list); - /* Last reference unless ->pending_task work is pending */ - put_event(child); - - /* - * Wake any perf_event_free_task() waiting for this event to be - * freed. - */ - smp_mb(); /* pairs with wait_var_event() */ - wake_up_var(var); - } - no_ctx: /* * Last reference unless ->pending_task work is pending on this event @@ -6068,8 +6062,14 @@ static __poll_t perf_poll(struct file *file, poll_table *wait) struct perf_buffer *rb; __poll_t events = EPOLLHUP; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return EPOLLERR; + poll_wait(file, &event->waitq, wait); + if (event->state <= PERF_EVENT_STATE_REVOKED) + return EPOLLERR; + if (is_event_hup(event)) return events; @@ -6239,12 +6239,18 @@ static int perf_event_set_output(struct perf_event *event, static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { void (*func)(struct perf_event *); u32 flags = arg; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + switch (cmd) { case PERF_EVENT_IOC_ENABLE: func = _perf_event_enable; @@ -6301,7 +6307,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon if (IS_ERR(prog)) return PTR_ERR(prog); - err = perf_event_set_bpf_prog(event, prog, 0); + err = __perf_event_set_bpf_prog(event, prog, 0); if (err) { bpf_prog_put(prog); return err; @@ -6620,9 +6626,22 @@ void ring_buffer_put(struct perf_buffer *rb) call_rcu(&rb->rcu_head, rb_free_rcu); } +typedef void (*mapped_f)(struct perf_event *event, struct mm_struct *mm); + +#define get_mapped(event, func) \ +({ struct pmu *pmu; \ + mapped_f f = NULL; \ + guard(rcu)(); \ + pmu = READ_ONCE(event->pmu); \ + if (pmu) \ + f = pmu->func; \ + f; \ +}) + static void perf_mmap_open(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; + mapped_f mapped = get_mapped(event, event_mapped); atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); @@ -6630,8 +6649,8 @@ static void perf_mmap_open(struct vm_area_struct *vma) if (vma->vm_pgoff) atomic_inc(&event->rb->aux_mmap_count); - if (event->pmu->event_mapped) - event->pmu->event_mapped(event, vma->vm_mm); + if (mapped) + mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); @@ -6647,14 +6666,16 @@ static void perf_pmu_output_stop(struct perf_event *event); static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; + mapped_f unmapped = get_mapped(event, event_unmapped); struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); bool detach_rest = false; - if (event->pmu->event_unmapped) - event->pmu->event_unmapped(event, vma->vm_mm); + /* FIXIES vs perf_pmu_unregister() */ + if (unmapped) + unmapped(event, vma->vm_mm); /* * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex @@ -6847,6 +6868,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long nr_pages; long user_extra = 0, extra = 0; int ret, flags = 0; + mapped_f mapped; /* * Don't allow mmap() of inherited per-task counters. This would @@ -6877,6 +6899,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) mutex_lock(&event->mmap_mutex); ret = -EINVAL; + /* + * This relies on __pmu_detach_event() taking mmap_mutex after marking + * the event REVOKED. Either we observe the state, or __pmu_detach_event() + * will detach the rb created here. + */ + if (event->state <= PERF_EVENT_STATE_REVOKED) { + ret = -ENODEV; + goto unlock; + } + if (vma->vm_pgoff == 0) { nr_pages -= 1; @@ -7055,8 +7087,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (!ret) ret = map_range(rb, vma); - if (!ret && event->pmu->event_mapped) - event->pmu->event_mapped(event, vma->vm_mm); + mapped = get_mapped(event, event_mapped); + if (mapped) + mapped(event, vma->vm_mm); return ret; } @@ -7067,6 +7100,9 @@ static int perf_fasync(int fd, struct file *filp, int on) struct perf_event *event = filp->private_data; int retval; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); inode_unlock(inode); @@ -11069,11 +11105,15 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, - u64 bpf_cookie) +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog, bpf_cookie); @@ -11108,6 +11148,20 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } +int perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie); + perf_event_ctx_unlock(event, ctx); + + return ret; +} + void perf_event_free_bpf_prog(struct perf_event *event) { if (!event->prog) @@ -11130,7 +11184,15 @@ static void perf_event_free_filter(struct perf_event *event) { } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + return -ENOENT; +} + +int perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, u64 bpf_cookie) { return -ENOENT; @@ -12235,6 +12297,9 @@ int perf_pmu_register(struct pmu *_pmu, const char *name, int type) if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; + INIT_LIST_HEAD(&pmu->events); + spin_lock_init(&pmu->events_lock); + /* * Now that the PMU is complete, make it visible to perf_try_init_event(). */ @@ -12248,21 +12313,143 @@ int perf_pmu_register(struct pmu *_pmu, const char *name, int type) } EXPORT_SYMBOL_GPL(perf_pmu_register); -void perf_pmu_unregister(struct pmu *pmu) +static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event, + struct perf_event_context *ctx) +{ + /* + * De-schedule the event and mark it REVOKED. + */ + perf_event_exit_event(event, ctx, true); + + /* + * All _free_event() bits that rely on event->pmu: + * + * Notably, perf_mmap() relies on the ordering here. + */ + scoped_guard (mutex, &event->mmap_mutex) { + WARN_ON_ONCE(pmu->event_unmapped); + /* + * Mostly an empty lock sequence, such that perf_mmap(), which + * relies on mmap_mutex, is sure to observe the state change. + */ + } + + perf_event_free_bpf_prog(event); + perf_free_addr_filters(event); + + if (event->destroy) { + event->destroy(event); + event->destroy = NULL; + } + + if (event->pmu_ctx) { + put_pmu_ctx(event->pmu_ctx); + event->pmu_ctx = NULL; + } + + exclusive_event_destroy(event); + module_put(pmu->module); + + event->pmu = NULL; /* force fault instead of UAF */ +} + +static void pmu_detach_event(struct pmu *pmu, struct perf_event *event) +{ + struct perf_event_context *ctx; + + ctx = perf_event_ctx_lock(event); + __pmu_detach_event(pmu, event, ctx); + perf_event_ctx_unlock(event, ctx); + + scoped_guard (spinlock, &pmu->events_lock) + list_del(&event->pmu_list); +} + +static struct perf_event *pmu_get_event(struct pmu *pmu) +{ + struct perf_event *event; + + guard(spinlock)(&pmu->events_lock); + list_for_each_entry(event, &pmu->events, pmu_list) { + if (atomic_long_inc_not_zero(&event->refcount)) + return event; + } + + return NULL; +} + +static bool pmu_empty(struct pmu *pmu) +{ + guard(spinlock)(&pmu->events_lock); + return list_empty(&pmu->events); +} + +static void pmu_detach_events(struct pmu *pmu) +{ + struct perf_event *event; + + for (;;) { + event = pmu_get_event(pmu); + if (!event) + break; + + pmu_detach_event(pmu, event); + put_event(event); + } + + /* + * wait for pending _free_event()s + */ + wait_var_event(pmu, pmu_empty(pmu)); +} + +int perf_pmu_unregister(struct pmu *pmu) { scoped_guard (mutex, &pmus_lock) { + if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL)) + return -EINVAL; + list_del_rcu(&pmu->entry); - idr_remove(&pmu_idr, pmu->type); } /* * We dereference the pmu list under both SRCU and regular RCU, so * synchronize against both of those. + * + * Notably, the entirety of event creation, from perf_init_event() + * (which will now fail, because of the above) until + * perf_install_in_context() should be under SRCU such that + * this synchronizes against event creation. This avoids trying to + * detach events that are not fully formed. */ synchronize_srcu(&pmus_srcu); synchronize_rcu(); + if (pmu->event_unmapped && !pmu_empty(pmu)) { + /* + * Can't force remove events when pmu::event_unmapped() + * is used in perf_mmap_close(). + */ + guard(mutex)(&pmus_lock); + idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu); + list_add_rcu(&pmu->entry, &pmus); + return -EBUSY; + } + + scoped_guard (mutex, &pmus_lock) + idr_remove(&pmu_idr, pmu->type); + + /* + * PMU is removed from the pmus list, so no new events will + * be created, now take care of the existing ones. + */ + pmu_detach_events(pmu); + + /* + * PMU is unused, make it go away. + */ perf_pmu_free(pmu); + return 0; } EXPORT_SYMBOL_GPL(perf_pmu_unregister); @@ -12356,7 +12543,7 @@ static struct pmu *perf_init_event(struct perf_event *event) struct pmu *pmu; int type, ret; - guard(srcu)(&pmus_srcu); + guard(srcu)(&pmus_srcu); /* pmu idr/list access */ /* * Save original type before calling pmu->event_init() since certain @@ -12580,6 +12767,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); + INIT_LIST_HEAD(&event->pmu_list); init_waitqueue_head(&event->waitq); @@ -12758,6 +12946,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, /* symmetric to unaccount_event() in _free_event() */ account_event(event); + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + lockdep_assert_held(&pmus_srcu); + scoped_guard (spinlock, &pmu->events_lock) + list_add(&event->pmu_list, &pmu->events); + return_ptr(event); } @@ -12957,6 +13152,9 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) goto unlock; if (output_event) { + if (output_event->state <= PERF_EVENT_STATE_REVOKED) + goto unlock; + /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); if (!rb) @@ -13138,6 +13336,11 @@ SYSCALL_DEFINE5(perf_event_open, if (event_fd < 0) return event_fd; + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + guard(srcu)(&pmus_srcu); + CLASS(fd, group)(group_fd); // group_fd == -1 => empty if (group_fd != -1) { if (!is_perf_file(group)) { @@ -13145,6 +13348,10 @@ SYSCALL_DEFINE5(perf_event_open, goto err_fd; } group_leader = fd_file(group)->private_data; + if (group_leader->state <= PERF_EVENT_STATE_REVOKED) { + err = -ENODEV; + goto err_fd; + } if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -13441,7 +13648,7 @@ SYSCALL_DEFINE5(perf_event_open, if (task) up_read(&task->signal->exec_update_lock); err_alloc: - free_event(event); + put_event(event); err_task: if (task) put_task_struct(task); @@ -13478,6 +13685,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, if (attr->aux_output || attr->aux_action) return ERR_PTR(-EINVAL); + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + guard(srcu)(&pmus_srcu); + event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); if (IS_ERR(event)) { @@ -13549,7 +13761,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, perf_unpin_context(ctx); put_ctx(ctx); err_alloc: - free_event(event); + put_event(event); err: return ERR_PTR(err); } @@ -13689,10 +13901,15 @@ static void sync_child_event(struct perf_event *child_event) } static void -perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) +perf_event_exit_event(struct perf_event *event, + struct perf_event_context *ctx, bool revoke) { struct perf_event *parent_event = event->parent; - unsigned long detach_flags = 0; + unsigned long detach_flags = DETACH_EXIT; + bool is_child = !!parent_event; + + if (parent_event == EVENT_TOMBSTONE) + parent_event = NULL; if (parent_event) { /* @@ -13707,22 +13924,29 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) * Do destroy all inherited groups, we don't care about those * and being thorough is better. */ - detach_flags = DETACH_GROUP | DETACH_CHILD; + detach_flags |= DETACH_GROUP | DETACH_CHILD; mutex_lock(&parent_event->child_mutex); } - perf_remove_from_context(event, detach_flags | DETACH_EXIT); + if (revoke) + detach_flags |= DETACH_GROUP | DETACH_REVOKE; + perf_remove_from_context(event, detach_flags); /* * Child events can be freed. */ - if (parent_event) { - mutex_unlock(&parent_event->child_mutex); - /* - * Kick perf_poll() for is_event_hup(); - */ - perf_event_wakeup(parent_event); - put_event(event); + if (is_child) { + if (parent_event) { + mutex_unlock(&parent_event->child_mutex); + /* + * Kick perf_poll() for is_event_hup(); + */ + perf_event_wakeup(parent_event); + /* + * pmu_detach_event() will have an extra refcount. + */ + put_event(event); + } return; } @@ -13732,15 +13956,13 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) perf_event_wakeup(event); } -static void perf_event_exit_task_context(struct task_struct *child) +static void perf_event_exit_task_context(struct task_struct *task, bool exit) { - struct perf_event_context *child_ctx, *clone_ctx = NULL; + struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; - WARN_ON_ONCE(child != current); - - child_ctx = perf_pin_task_context(child); - if (!child_ctx) + ctx = perf_pin_task_context(task); + if (!ctx) return; /* @@ -13753,27 +13975,28 @@ static void perf_event_exit_task_context(struct task_struct *child) * without ctx::mutex (it cannot because of the move_group double mutex * lock thing). See the comments in perf_install_in_context(). */ - mutex_lock(&child_ctx->mutex); + mutex_lock(&ctx->mutex); /* * In a single ctx::lock section, de-schedule the events and detach the * context from the task such that we cannot ever get it scheduled back * in. */ - raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(child_ctx, NULL, EVENT_ALL); + raw_spin_lock_irq(&ctx->lock); + if (exit) + task_ctx_sched_out(ctx, NULL, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation * and mark the context dead. */ - RCU_INIT_POINTER(child->perf_event_ctxp, NULL); - put_ctx(child_ctx); /* cannot be last */ - WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); - put_task_struct(current); /* cannot be last */ + RCU_INIT_POINTER(task->perf_event_ctxp, NULL); + put_ctx(ctx); /* cannot be last */ + WRITE_ONCE(ctx->task, TASK_TOMBSTONE); + put_task_struct(task); /* cannot be last */ - clone_ctx = unclone_ctx(child_ctx); - raw_spin_unlock_irq(&child_ctx->lock); + clone_ctx = unclone_ctx(ctx); + raw_spin_unlock_irq(&ctx->lock); if (clone_ctx) put_ctx(clone_ctx); @@ -13783,28 +14006,48 @@ static void perf_event_exit_task_context(struct task_struct *child) * won't get any samples after PERF_RECORD_EXIT. We can however still * get a few PERF_RECORD_READ events. */ - perf_event_task(child, child_ctx, 0); + if (exit) + perf_event_task(task, ctx, 0); - list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) - perf_event_exit_event(child_event, child_ctx); + list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry) + perf_event_exit_event(child_event, ctx, false); - mutex_unlock(&child_ctx->mutex); + mutex_unlock(&ctx->mutex); - put_ctx(child_ctx); + if (!exit) { + /* + * perf_event_release_kernel() could still have a reference on + * this context. In that case we must wait for these events to + * have been freed (in particular all their references to this + * task must've been dropped). + * + * Without this copy_process() will unconditionally free this + * task (irrespective of its reference count) and + * _free_event()'s put_task_struct(event->hw.target) will be a + * use-after-free. + * + * Wait for all events to drop their context reference. + */ + wait_var_event(&ctx->refcount, + refcount_read(&ctx->refcount) == 1); + } + put_ctx(ctx); } /* - * When a child task exits, feed back event values to parent events. + * When a task exits, feed back event values to parent events. * * Can be called with exec_update_lock held when called from * setup_new_exec(). */ -void perf_event_exit_task(struct task_struct *child) +void perf_event_exit_task(struct task_struct *task) { struct perf_event *event, *tmp; - mutex_lock(&child->perf_event_mutex); - list_for_each_entry_safe(event, tmp, &child->perf_event_list, + WARN_ON_ONCE(task != current); + + mutex_lock(&task->perf_event_mutex); + list_for_each_entry_safe(event, tmp, &task->perf_event_list, owner_entry) { list_del_init(&event->owner_entry); @@ -13815,42 +14058,23 @@ void perf_event_exit_task(struct task_struct *child) */ smp_store_release(&event->owner, NULL); } - mutex_unlock(&child->perf_event_mutex); + mutex_unlock(&task->perf_event_mutex); - perf_event_exit_task_context(child); + perf_event_exit_task_context(task, true); /* * The perf_event_exit_task_context calls perf_event_task - * with child's task_ctx, which generates EXIT events for - * child contexts and sets child->perf_event_ctxp[] to NULL. + * with task's task_ctx, which generates EXIT events for + * task contexts and sets task->perf_event_ctxp[] to NULL. * At this point we need to send EXIT events to cpu contexts. */ - perf_event_task(child, NULL, 0); + perf_event_task(task, NULL, 0); /* * Detach the perf_ctx_data for the system-wide event. */ guard(percpu_read)(&global_ctx_data_rwsem); - detach_task_ctx_data(child); -} - -static void perf_free_event(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *parent = event->parent; - - if (WARN_ON_ONCE(!parent)) - return; - - mutex_lock(&parent->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent->child_mutex); - - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); - list_del_event(event, ctx); - raw_spin_unlock_irq(&ctx->lock); - put_event(event); + detach_task_ctx_data(task); } /* @@ -13862,48 +14086,7 @@ static void perf_free_event(struct perf_event *event, */ void perf_event_free_task(struct task_struct *task) { - struct perf_event_context *ctx; - struct perf_event *event, *tmp; - - ctx = rcu_access_pointer(task->perf_event_ctxp); - if (!ctx) - return; - - mutex_lock(&ctx->mutex); - raw_spin_lock_irq(&ctx->lock); - /* - * Destroy the task <-> ctx relation and mark the context dead. - * - * This is important because even though the task hasn't been - * exposed yet the context has been (through child_list). - */ - RCU_INIT_POINTER(task->perf_event_ctxp, NULL); - WRITE_ONCE(ctx->task, TASK_TOMBSTONE); - put_task_struct(task); /* cannot be last */ - raw_spin_unlock_irq(&ctx->lock); - - - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) - perf_free_event(event, ctx); - - mutex_unlock(&ctx->mutex); - - /* - * perf_event_release_kernel() could've stolen some of our - * child events and still have them on its free_list. In that - * case we must wait for these events to have been freed (in - * particular all their references to this task must've been - * dropped). - * - * Without this copy_process() will unconditionally free this - * task (irrespective of its reference count) and - * _free_event()'s put_task_struct(event->hw.target) will be a - * use-after-free. - * - * Wait for all events to drop their context reference. - */ - wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); - put_ctx(ctx); /* must be last */ + perf_event_exit_task_context(task, false); } void perf_event_delayed_put(struct task_struct *task) @@ -13980,6 +14163,14 @@ inherit_event(struct perf_event *parent_event, if (parent_event->parent) parent_event = parent_event->parent; + if (parent_event->state <= PERF_EVENT_STATE_REVOKED) + return NULL; + + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + guard(srcu)(&pmus_srcu); + child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child, @@ -13993,7 +14184,7 @@ inherit_event(struct perf_event *parent_event, pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { - free_event(child_event); + put_event(child_event); return ERR_CAST(pmu_ctx); } child_event->pmu_ctx = pmu_ctx; @@ -14008,7 +14199,7 @@ inherit_event(struct perf_event *parent_event, if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { mutex_unlock(&parent_event->child_mutex); - free_event(child_event); + put_event(child_event); return NULL; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8d783b5882b6a..408a134c1a7b9 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -29,6 +29,7 @@ #include #include #include /* check_stable_address_space */ +#include #include @@ -151,91 +152,6 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start); } -/** - * __replace_page - replace page in vma by new page. - * based on replace_page in mm/ksm.c - * - * @vma: vma that holds the pte pointing to page - * @addr: address the old @page is mapped at - * @old_page: the page we are replacing by new_page - * @new_page: the modified page we replace page by - * - * If @new_page is NULL, only unmap @old_page. - * - * Returns 0 on success, negative error code otherwise. - */ -static int __replace_page(struct vm_area_struct *vma, unsigned long addr, - struct page *old_page, struct page *new_page) -{ - struct folio *old_folio = page_folio(old_page); - struct folio *new_folio; - struct mm_struct *mm = vma->vm_mm; - DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); - int err; - struct mmu_notifier_range range; - pte_t pte; - - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, - addr + PAGE_SIZE); - - if (new_page) { - new_folio = page_folio(new_page); - err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL); - if (err) - return err; - } - - /* For folio_free_swap() below */ - folio_lock(old_folio); - - mmu_notifier_invalidate_range_start(&range); - err = -EAGAIN; - if (!page_vma_mapped_walk(&pvmw)) - goto unlock; - VM_BUG_ON_PAGE(addr != pvmw.address, old_page); - pte = ptep_get(pvmw.pte); - - /* - * Handle PFN swap PTES, such as device-exclusive ones, that actually - * map pages: simply trigger GUP again to fix it up. - */ - if (unlikely(!pte_present(pte))) { - page_vma_mapped_walk_done(&pvmw); - goto unlock; - } - - if (new_page) { - folio_get(new_folio); - folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE); - folio_add_lru_vma(new_folio, vma); - } else - /* no new page, just dec_mm_counter for old_page */ - dec_mm_counter(mm, MM_ANONPAGES); - - if (!folio_test_anon(old_folio)) { - dec_mm_counter(mm, mm_counter_file(old_folio)); - inc_mm_counter(mm, MM_ANONPAGES); - } - - flush_cache_page(vma, addr, pte_pfn(pte)); - ptep_clear_flush(vma, addr, pvmw.pte); - if (new_page) - set_pte_at(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); - - folio_remove_rmap_pte(old_folio, old_page, vma); - if (!folio_mapped(old_folio)) - folio_free_swap(old_folio); - page_vma_mapped_walk_done(&pvmw); - folio_put(old_folio); - - err = 0; - unlock: - mmu_notifier_invalidate_range_end(&range); - folio_unlock(old_folio); - return err; -} - /** * is_swbp_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. @@ -261,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn) return is_swbp_insn(insn); } -static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) +void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); @@ -275,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src kunmap_atomic(kaddr); } -static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) +static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, + int nbytes, void *data) { uprobe_opcode_t old_opcode; bool is_swbp; @@ -289,10 +206,10 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * is a trap variant; uprobes always wins over any other (gdb) * breakpoint. */ - copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); - if (is_swbp_insn(new_opcode)) { + if (is_swbp_insn(insn)) { if (is_swbp) /* register: already installed? */ return 0; } else { @@ -463,6 +380,95 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, return ret; } +static bool orig_page_is_identical(struct vm_area_struct *vma, + unsigned long vaddr, struct page *page, bool *pmd_mappable) +{ + const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT; + struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping, + index); + struct page *orig_page; + bool identical; + + if (IS_ERR(orig_folio)) + return false; + orig_page = folio_file_page(orig_folio, index); + + *pmd_mappable = folio_test_pmd_mappable(orig_folio); + identical = folio_test_uptodate(orig_folio) && + pages_identical(page, orig_page); + folio_put(orig_folio); + return identical; +} + +static int __uprobe_write(struct vm_area_struct *vma, + struct folio_walk *fw, struct folio *folio, + unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + bool is_register) +{ + const unsigned long vaddr = insn_vaddr & PAGE_MASK; + bool pmd_mappable; + + /* For now, we'll only handle PTE-mapped folios. */ + if (fw->level != FW_LEVEL_PTE) + return -EFAULT; + + /* + * See can_follow_write_pte(): we'd actually prefer a writable PTE here, + * but the VMA might not be writable. + */ + if (!pte_write(fw->pte)) { + if (!PageAnonExclusive(fw->page)) + return -EFAULT; + if (unlikely(userfaultfd_pte_wp(vma, fw->pte))) + return -EFAULT; + /* SOFTDIRTY is handled via pte_mkdirty() below. */ + } + + /* + * We'll temporarily unmap the page and flush the TLB, such that we can + * modify the page atomically. + */ + flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); + fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); + copy_to_page(fw->page, insn_vaddr, insn, nbytes); + + /* + * When unregistering, we may only zap a PTE if uffd is disabled and + * there are no unexpected folio references ... + */ + if (is_register || userfaultfd_missing(vma) || + (folio_ref_count(folio) != folio_mapcount(folio) + 1 + + folio_test_swapcache(folio) * folio_nr_pages(folio))) + goto remap; + + /* + * ... and the mapped page is identical to the original page that + * would get faulted in on next access. + */ + if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable)) + goto remap; + + dec_mm_counter(vma->vm_mm, MM_ANONPAGES); + folio_remove_rmap_pte(folio, fw->page, vma); + if (!folio_mapped(folio) && folio_test_swapcache(folio) && + folio_trylock(folio)) { + folio_free_swap(folio); + folio_unlock(folio); + } + folio_put(folio); + + return pmd_mappable; +remap: + /* + * Make sure that our copy_to_page() changes become visible before the + * set_pte_at() write. + */ + smp_wmb(); + /* We modified the page. Make sure to mark the PTE dirty. */ + set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte)); + return 0; +} + /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for @@ -474,147 +480,165 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, * * uprobe_write_opcode - write the opcode at a given virtual address. * @auprobe: arch specific probepoint information. - * @mm: the probed process address space. - * @vaddr: the virtual address to store the opcode. - * @opcode: opcode to be written at @vaddr. + * @vma: the probed virtual memory area. + * @opcode_vaddr: the virtual address to store the opcode. + * @opcode: opcode to be written at @opcode_vaddr. * * Called with mm->mmap_lock held for read or write. * Return 0 (success) or a negative errno. */ -int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, - unsigned long vaddr, uprobe_opcode_t opcode) +int uprobe_write_opcode(struct vm_area_struct *vma, const unsigned long opcode_vaddr, + uprobe_opcode_t opcode, bool is_register) { - struct uprobe *uprobe; - struct page *old_page, *new_page; - struct vm_area_struct *vma; - int ret, is_register, ref_ctr_updated = 0; - bool orig_page_huge = false; + return uprobe_write(vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, + verify_opcode, is_register, NULL); +} + +int uprobe_write(struct vm_area_struct *vma, const unsigned long insn_vaddr, + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, + bool is_register, void *data) +{ + const unsigned long vaddr = insn_vaddr & PAGE_MASK; + struct mm_struct *mm = vma->vm_mm; + int ret; unsigned int gup_flags = FOLL_FORCE; + struct mmu_notifier_range range; + struct folio_walk fw; + struct folio *folio; + struct page *page; - is_register = is_swbp_insn(&opcode); - uprobe = container_of(auprobe, struct uprobe, arch); + if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) + return -EINVAL; -retry: + /* + * When registering, we have to break COW to get an exclusive anonymous + * page that we can safely modify. Use FOLL_WRITE to trigger a write + * fault if required. When unregistering, we might be lucky and the + * anon page is already gone. So defer write faults until really + * required. Use FOLL_SPLIT_PMD, because __uprobe_write() + * cannot deal with PMDs yet. + */ if (is_register) - gup_flags |= FOLL_SPLIT_PMD; - /* Read the page with vaddr into memory */ - old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); - if (IS_ERR(old_page)) - return PTR_ERR(old_page); + gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; - ret = verify_opcode(old_page, vaddr, &opcode); +retry: + ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL); if (ret <= 0) - goto put_old; + goto out; + folio = page_folio(page); - if (is_zero_page(old_page)) { - ret = -EINVAL; - goto put_old; + ret = verify(page, insn_vaddr, insn, nbytes, data); + if (ret <= 0) { + folio_put(folio); + goto out; } - if (WARN(!is_register && PageCompound(old_page), - "uprobe unregister should never work on compound page\n")) { - ret = -EINVAL; - goto put_old; + ret = 0; + if (unlikely(!folio_test_anon(folio))) { + VM_WARN_ON_ONCE(is_register); + folio_put(folio); + goto out; } - /* We are going to replace instruction, update ref_ctr. */ - if (!ref_ctr_updated && uprobe->ref_ctr_offset) { - ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); - if (ret) - goto put_old; - - ref_ctr_updated = 1; + if (!is_register) { + /* + * In the common case, we'll be able to zap the page when + * unregistering. So trigger MMU notifiers now, as we won't + * be able to do it under PTL. + */ + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, + vaddr, vaddr + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); } - ret = 0; - if (!is_register && !PageAnon(old_page)) - goto put_old; - - ret = anon_vma_prepare(vma); - if (ret) - goto put_old; - - ret = -ENOMEM; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); - if (!new_page) - goto put_old; - - __SetPageUptodate(new_page); - copy_highpage(new_page, old_page); - copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); - - if (!is_register) { - struct page *orig_page; - pgoff_t index; - - VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); - - index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; - orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, - index); - - if (orig_page) { - if (PageUptodate(orig_page) && - pages_identical(new_page, orig_page)) { - /* let go new_page */ - put_page(new_page); - new_page = NULL; - - if (PageCompound(orig_page)) - orig_page_huge = true; - } - put_page(orig_page); - } + ret = -EAGAIN; + /* Walk the page tables again, to perform the actual update. */ + if (folio_walk_start(&fw, vma, vaddr, 0)) { + if (fw.page == page) + ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register); + folio_walk_end(&fw, vma); } - ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page); - if (new_page) - put_page(new_page); -put_old: - put_page(old_page); + if (!is_register) + mmu_notifier_invalidate_range_end(&range); - if (unlikely(ret == -EAGAIN)) + folio_put(folio); + switch (ret) { + case -EFAULT: + gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; + fallthrough; + case -EAGAIN: goto retry; + default: + break; + } - /* Revert back reference counter if instruction update failed. */ - if (ret && is_register && ref_ctr_updated) - update_ref_ctr(uprobe, mm, -1); - +out: /* try collapse pmd for compound page */ - if (!ret && orig_page_huge) + if (ret > 0) collapse_pte_mapped_thp(mm, vaddr, false); - return ret; + return ret < 0 ? ret : 0; } /** * set_swbp - store breakpoint at a given address. * @auprobe: arch specific probepoint information. - * @mm: the probed process address space. + * @vma: the probed virtual memory area. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. */ -int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) { - return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(vma, vaddr, UPROBE_SWBP_INSN, true); +} + +static int set_swbp_refctr(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long vaddr) +{ + struct mm_struct *mm = vma->vm_mm; + int err; + + /* We are going to replace instruction, update ref_ctr. */ + if (uprobe->ref_ctr_offset) { + err = update_ref_ctr(uprobe, mm, 1); + if (err) + return err; + } + + err = set_swbp(&uprobe->arch, vma, vaddr); + + /* Revert back reference counter if instruction update failed. */ + if (err && uprobe->ref_ctr_offset) + update_ref_ctr(uprobe, mm, -1); + return err; } /** * set_orig_insn - Restore the original instruction. - * @mm: the probed process address space. + * @vma: the probed virtual memory area. * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, restore the original opcode (opcode) at @vaddr. * Return 0 (success) or a negative errno. */ -int __weak -set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +int __weak set_orig_insn(struct arch_uprobe *auprobe, + struct vm_area_struct *vma, unsigned long vaddr) +{ + return uprobe_write_opcode(vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, false); +} + +static int set_orig_refctr(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long vaddr) { - return uprobe_write_opcode(auprobe, mm, vaddr, - *(uprobe_opcode_t *)&auprobe->insn); + int err = set_orig_insn(&uprobe->arch, vma, vaddr); + + /* Revert back reference counter even if instruction update failed. */ + if (uprobe->ref_ctr_offset) + update_ref_ctr(uprobe, vma->vm_mm, -1); + return err; } /* uprobe should have guaranteed positive refcount */ @@ -1047,7 +1071,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, if (IS_ERR(page)) return PTR_ERR(page); - copy_from_page(page, offset, insn, nbytes); + uprobe_copy_from_page(page, offset, insn, nbytes); put_page(page); return 0; @@ -1134,10 +1158,10 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) return ret; } -static int -install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long vaddr) +static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, + unsigned long vaddr) { + struct mm_struct *mm = vma->vm_mm; bool first_uprobe; int ret; @@ -1153,7 +1177,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (first_uprobe) set_bit(MMF_HAS_UPROBES, &mm->flags); - ret = set_swbp(&uprobe->arch, mm, vaddr); + ret = set_swbp_refctr(uprobe, vma, vaddr); if (!ret) clear_bit(MMF_RECALC_UPROBES, &mm->flags); else if (first_uprobe) @@ -1162,11 +1186,13 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, return ret; } -static int -remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) +static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, + unsigned long vaddr) { + struct mm_struct *mm = vma->vm_mm; + set_bit(MMF_RECALC_UPROBES, &mm->flags); - return set_orig_insn(&uprobe->arch, mm, vaddr); + return set_orig_refctr(uprobe, vma, vaddr); } struct map_info { @@ -1296,10 +1322,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (is_register) { /* consult only the "caller", new consumer. */ if (consumer_filter(new, mm)) - err = install_breakpoint(uprobe, mm, vma, info->vaddr); + err = install_breakpoint(uprobe, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { if (!filter_chain(uprobe, mm)) - err |= remove_breakpoint(uprobe, mm, info->vaddr); + err |= remove_breakpoint(uprobe, vma, info->vaddr); } unlock: @@ -1391,7 +1417,7 @@ struct uprobe *uprobe_register(struct inode *inode, return ERR_PTR(-EINVAL); /* - * This ensures that copy_from_page(), copy_to_page() and + * This ensures that uprobe_copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) @@ -1457,7 +1483,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) struct vm_area_struct *vma; int err = 0; - mmap_read_lock(mm); + mmap_write_lock(mm); for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -1472,9 +1498,9 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) continue; vaddr = offset_to_vaddr(vma, uprobe->offset); - err |= remove_breakpoint(uprobe, mm, vaddr); + err |= remove_breakpoint(uprobe, vma, vaddr); } - mmap_read_unlock(mm); + mmap_write_unlock(mm); return err; } @@ -1610,7 +1636,7 @@ int uprobe_mmap(struct vm_area_struct *vma) if (!fatal_signal_pending(current) && filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); - install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); + install_breakpoint(uprobe, vma, vaddr); } put_uprobe(uprobe); } @@ -1720,7 +1746,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) return ret; } -void * __weak arch_uprobe_trampoline(unsigned long *psize) +void * __weak arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; @@ -1752,7 +1778,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - insns = arch_uprobe_trampoline(&insns_size); + insns = arch_uretprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) @@ -1786,6 +1812,14 @@ static struct xol_area *get_xol_area(void) return area; } +void __weak arch_uprobe_clear_state(struct mm_struct *mm) +{ +} + +void __weak arch_uprobe_init_state(struct mm_struct *mm) +{ +} + /* * uprobe_clear_state - Free the area allocated for slots. */ @@ -1797,6 +1831,8 @@ void uprobe_clear_state(struct mm_struct *mm) delayed_uprobe_remove(NULL, mm); mutex_unlock(&delayed_uprobe_lock); + arch_uprobe_clear_state(mm); + if (!area) return; @@ -2387,7 +2423,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (result < 0) return result; - copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ @@ -2671,6 +2707,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c return true; } +void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -2735,6 +2775,9 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); + /* Try to optimize after first hit. */ + arch_uprobe_optimize(&uprobe->arch, bp_vaddr); + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; @@ -2746,6 +2789,23 @@ static void handle_swbp(struct pt_regs *regs) rcu_read_unlock_trace(); } +void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr) +{ + struct uprobe *uprobe; + int is_swbp; + + guard(rcu_tasks_trace)(); + + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); + if (!uprobe) + return; + if (!get_utask()) + return; + if (arch_uprobe_ignore(&uprobe->arch, regs)) + return; + handler_chain(uprobe, regs); +} + /* * Perform required fix-ups and disable singlestep. * Allow pending signals to take effect. diff --git a/kernel/fork.c b/kernel/fork.c index c4b26cd8998b8..4c2df38167286 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1269,6 +1269,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES mm->uprobes_state.xol_area = NULL; + arch_uprobe_init_state(mm); #endif } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index fba686487e3b5..77758c5331229 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -253,6 +253,11 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, /* IMA needs to pass the measurement list to the next kernel. */ ima_add_kexec_buffer(image); + /* If KHO is active, add its images to the list */ + ret = kho_fill_kimage(image); + if (ret) + goto out; + /* Call image load handler */ ldata = kexec_image_load_default(image); @@ -648,6 +653,14 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) return 0; + /* + * If KHO is active, only use KHO scratch memory. All other memory + * could potentially be handed over. + */ + ret = kho_locate_mem_hole(kbuf, locate_mem_hole_callback); + if (ret <= 0) + return ret; + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c new file mode 100644 index 0000000000000..4bbc6f5ef7230 --- /dev/null +++ b/kernel/kexec_handover.c @@ -0,0 +1,1253 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover.c - kexec handover metadata processing + * Copyright (C) 2023 Alexander Graf + * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport + * Copyright (C) 2025 Google LLC, Changyuan Lyu + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +/* + * KHO is tightly coupled with mm init and needs access to some of mm + * internal APIs. + */ +#include "../mm/internal.h" +#include "kexec_internal.h" + +#define KHO_FDT_COMPATIBLE "kho-v1" +#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map" +#define PROP_SUB_FDT "fdt" + +static bool kho_enable __ro_after_init; + +bool kho_is_enabled(void) +{ + return kho_enable; +} +EXPORT_SYMBOL_GPL(kho_is_enabled); + +static int __init kho_parse_enable(char *p) +{ + return kstrtobool(p, &kho_enable); +} +early_param("kho", kho_parse_enable); + +/* + * Keep track of memory that is to be preserved across KHO. + * + * The serializing side uses two levels of xarrays to manage chunks of per-order + * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a + * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations + * each bitmap will cover 16M of address space. Thus, for 16G of memory at most + * 512K of bitmap memory will be needed for order 0. + * + * This approach is fully incremental, as the serialization progresses folios + * can continue be aggregated to the tracker. The final step, immediately prior + * to kexec would serialize the xarray information into a linked list for the + * successor kernel to parse. + */ + +#define PRESERVE_BITS (512 * 8) + +struct kho_mem_phys_bits { + DECLARE_BITMAP(preserve, PRESERVE_BITS); +}; + +struct kho_mem_phys { + /* + * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized + * to order. + */ + struct xarray phys_bits; +}; + +struct kho_mem_track { + /* Points to kho_mem_phys, each order gets its own bitmap tree */ + struct xarray orders; +}; + +struct khoser_mem_chunk; + +struct kho_serialization { + struct page *fdt; + struct list_head fdt_list; + struct dentry *sub_fdt_dir; + struct kho_mem_track track; + /* First chunk of serialized preserved memory map */ + struct khoser_mem_chunk *preserved_mem_map; +}; + +static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) +{ + void *elm, *res; + + elm = xa_load(xa, index); + if (elm) + return elm; + + elm = kzalloc(sz, GFP_KERNEL); + if (!elm) + return ERR_PTR(-ENOMEM); + + res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); + if (xa_is_err(res)) + res = ERR_PTR(xa_err(res)); + + if (res) { + kfree(elm); + return res; + } + + return elm; +} + +static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, + unsigned long end_pfn) +{ + struct kho_mem_phys_bits *bits; + struct kho_mem_phys *physxa; + + while (pfn < end_pfn) { + const unsigned int order = + min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + const unsigned long pfn_high = pfn >> order; + + physxa = xa_load(&track->orders, order); + if (!physxa) + continue; + + bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); + if (!bits) + continue; + + clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); + + pfn += 1 << order; + } +} + +static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, + unsigned int order) +{ + struct kho_mem_phys_bits *bits; + struct kho_mem_phys *physxa; + const unsigned long pfn_high = pfn >> order; + + might_sleep(); + + physxa = xa_load_or_alloc(&track->orders, order, sizeof(*physxa)); + if (IS_ERR(physxa)) + return PTR_ERR(physxa); + + bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS, + sizeof(*bits)); + if (IS_ERR(bits)) + return PTR_ERR(bits); + + set_bit(pfn_high % PRESERVE_BITS, bits->preserve); + + return 0; +} + +/** + * kho_preserve_folio - preserve a folio across kexec. + * @ser: serialization control object passed by KHO notifiers. + * @folio: folio to preserve. + * + * Instructs KHO to preserve the whole folio across kexec. The order + * will be preserved as well. + * + * Return: 0 on success, error code on failure + */ +int kho_preserve_folio(struct kho_serialization *ser, struct folio *folio) +{ + const unsigned long pfn = folio_pfn(folio); + const unsigned int order = folio_order(folio); + + return __kho_preserve_order(&ser->track, pfn, order); +} +EXPORT_SYMBOL_GPL(kho_preserve_folio); + +/** + * kho_preserve_phys - preserve a physically contiguous range across kexec. + * @ser: serialization control object passed by KHO notifiers. + * @phys: physical address of the range. + * @size: size of the range. + * + * Instructs KHO to preserve the memory range from @phys to @phys + @size + * across kexec. + * + * Return: 0 on success, error code on failure + */ +int kho_preserve_phys(struct kho_serialization *ser, phys_addr_t phys, + size_t size) +{ + unsigned long pfn = PHYS_PFN(phys); + unsigned long failed_pfn = 0; + const unsigned long start_pfn = pfn; + const unsigned long end_pfn = PHYS_PFN(phys + size); + int err = 0; + + if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) + return -EINVAL; + + while (pfn < end_pfn) { + const unsigned int order = + min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + + err = __kho_preserve_order(&ser->track, pfn, order); + if (err) { + failed_pfn = pfn; + break; + } + + pfn += 1 << order; + } + + if (err) + __kho_unpreserve(&ser->track, start_pfn, failed_pfn); + + return err; +} +EXPORT_SYMBOL_GPL(kho_preserve_phys); + +/* almost as free_reserved_page(), just don't free the page */ +static void kho_restore_page(struct page *page) +{ + ClearPageReserved(page); + init_page_count(page); + adjust_managed_page_count(page, 1); +} + +/** + * kho_restore_folio - recreates the folio from the preserved memory. + * @phys: physical address of the folio. + * + * Return: pointer to the struct folio on success, NULL on failure. + */ +struct folio *kho_restore_folio(phys_addr_t phys) +{ + struct page *page = pfn_to_online_page(PHYS_PFN(phys)); + unsigned long order; + + if (!page) + return NULL; + + order = page->private; + if (order) { + if (order > MAX_PAGE_ORDER) + return NULL; + + prep_compound_page(page, order); + } else { + kho_restore_page(page); + } + + return page_folio(page); +} +EXPORT_SYMBOL_GPL(kho_restore_folio); + +/* Serialize and deserialize struct kho_mem_phys across kexec + * + * Record all the bitmaps in a linked list of pages for the next kernel to + * process. Each chunk holds bitmaps of the same order and each block of bitmaps + * starts at a given physical address. This allows the bitmaps to be sparse. The + * xarray is used to store them in a tree while building up the data structure, + * but the KHO successor kernel only needs to process them once in order. + * + * All of this memory is normal kmalloc() memory and is not marked for + * preservation. The successor kernel will remain isolated to the scratch space + * until it completes processing this list. Once processed all the memory + * storing these ranges will be marked as free. + */ + +struct khoser_mem_bitmap_ptr { + phys_addr_t phys_start; + DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *); +}; + +struct khoser_mem_chunk_hdr { + DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *); + unsigned int order; + unsigned int num_elms; +}; + +#define KHOSER_BITMAP_SIZE \ + ((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \ + sizeof(struct khoser_mem_bitmap_ptr)) + +struct khoser_mem_chunk { + struct khoser_mem_chunk_hdr hdr; + struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE]; +}; + +static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); + +static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, + unsigned long order) +{ + struct khoser_mem_chunk *chunk; + + chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!chunk) + return NULL; + chunk->hdr.order = order; + if (cur_chunk) + KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk); + return chunk; +} + +static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) +{ + struct khoser_mem_chunk *chunk = first_chunk; + + while (chunk) { + struct khoser_mem_chunk *tmp = chunk; + + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + kfree(tmp); + } +} + +static int kho_mem_serialize(struct kho_serialization *ser) +{ + struct khoser_mem_chunk *first_chunk = NULL; + struct khoser_mem_chunk *chunk = NULL; + struct kho_mem_phys *physxa; + unsigned long order; + + xa_for_each(&ser->track.orders, order, physxa) { + struct kho_mem_phys_bits *bits; + unsigned long phys; + + chunk = new_chunk(chunk, order); + if (!chunk) + goto err_free; + + if (!first_chunk) + first_chunk = chunk; + + xa_for_each(&physxa->phys_bits, phys, bits) { + struct khoser_mem_bitmap_ptr *elm; + + if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) { + chunk = new_chunk(chunk, order); + if (!chunk) + goto err_free; + } + + elm = &chunk->bitmaps[chunk->hdr.num_elms]; + chunk->hdr.num_elms++; + elm->phys_start = (phys * PRESERVE_BITS) + << (order + PAGE_SHIFT); + KHOSER_STORE_PTR(elm->bitmap, bits); + } + } + + ser->preserved_mem_map = first_chunk; + + return 0; + +err_free: + kho_mem_ser_free(first_chunk); + return -ENOMEM; +} + +static void deserialize_bitmap(unsigned int order, + struct khoser_mem_bitmap_ptr *elm) +{ + struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap); + unsigned long bit; + + for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) { + int sz = 1 << (order + PAGE_SHIFT); + phys_addr_t phys = + elm->phys_start + (bit << (order + PAGE_SHIFT)); + struct page *page = phys_to_page(phys); + + memblock_reserve(phys, sz); + memblock_reserved_mark_noinit(phys, sz); + page->private = order; + } +} + +static void __init kho_mem_deserialize(const void *fdt) +{ + struct khoser_mem_chunk *chunk; + const phys_addr_t *mem; + int len; + + mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len); + + if (!mem || len != sizeof(*mem)) { + pr_err("failed to get preserved memory bitmaps\n"); + return; + } + + chunk = *mem ? phys_to_virt(*mem) : NULL; + while (chunk) { + unsigned int i; + + for (i = 0; i != chunk->hdr.num_elms; i++) + deserialize_bitmap(chunk->hdr.order, + &chunk->bitmaps[i]); + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + } +} + +/* + * With KHO enabled, memory can become fragmented because KHO regions may + * be anywhere in physical address space. The scratch regions give us a + * safe zones that we will never see KHO allocations from. This is where we + * can later safely load our new kexec images into and then use the scratch + * area for early allocations that happen before page allocator is + * initialized. + */ +static struct kho_scratch *kho_scratch; +static unsigned int kho_scratch_cnt; + +/* + * The scratch areas are scaled by default as percent of memory allocated from + * memblock. A user can override the scale with command line parameter: + * + * kho_scratch=N% + * + * It is also possible to explicitly define size for a lowmem, a global and + * per-node scratch areas: + * + * kho_scratch=l[KMG],n[KMG],m[KMG] + * + * The explicit size definition takes precedence over scale definition. + */ +static unsigned int scratch_scale __initdata = 200; +static phys_addr_t scratch_size_global __initdata; +static phys_addr_t scratch_size_pernode __initdata; +static phys_addr_t scratch_size_lowmem __initdata; + +static int __init kho_parse_scratch_size(char *p) +{ + size_t len; + unsigned long sizes[3]; + int i; + + if (!p) + return -EINVAL; + + len = strlen(p); + if (!len) + return -EINVAL; + + /* parse nn% */ + if (p[len - 1] == '%') { + /* unsigned int max is 4,294,967,295, 10 chars */ + char s_scale[11] = {}; + int ret = 0; + + if (len > ARRAY_SIZE(s_scale)) + return -EINVAL; + + memcpy(s_scale, p, len - 1); + ret = kstrtouint(s_scale, 10, &scratch_scale); + if (!ret) + pr_notice("scratch scale is %d%%\n", scratch_scale); + return ret; + } + + /* parse ll[KMG],mm[KMG],nn[KMG] */ + for (i = 0; i < ARRAY_SIZE(sizes); i++) { + char *endp = p; + + if (i > 0) { + if (*p != ',') + return -EINVAL; + p += 1; + } + + sizes[i] = memparse(p, &endp); + if (!sizes[i] || endp == p) + return -EINVAL; + p = endp; + } + + scratch_size_lowmem = sizes[0]; + scratch_size_global = sizes[1]; + scratch_size_pernode = sizes[2]; + scratch_scale = 0; + + pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lldMiB\n", + (u64)(scratch_size_lowmem >> 20), + (u64)(scratch_size_global >> 20), + (u64)(scratch_size_pernode >> 20)); + + return 0; +} +early_param("kho_scratch", kho_parse_scratch_size); + +static void __init scratch_size_update(void) +{ + phys_addr_t size; + + if (!scratch_scale) + return; + + size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT, + NUMA_NO_NODE); + size = size * scratch_scale / 100; + scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES); + + size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, + NUMA_NO_NODE); + size = size * scratch_scale / 100 - scratch_size_lowmem; + scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES); +} + +static phys_addr_t __init scratch_size_node(int nid) +{ + phys_addr_t size; + + if (scratch_scale) { + size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, + nid); + size = size * scratch_scale / 100; + } else { + size = scratch_size_pernode; + } + + return round_up(size, CMA_MIN_ALIGNMENT_BYTES); +} + +/** + * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec + * + * With KHO we can preserve arbitrary pages in the system. To ensure we still + * have a large contiguous region of memory when we search the physical address + * space for target memory, let's make sure we always have a large CMA region + * active. This CMA region will only be used for movable pages which are not a + * problem for us during KHO because we can just move them somewhere else. + */ +static void __init kho_reserve_scratch(void) +{ + phys_addr_t addr, size; + int nid, i = 0; + + if (!kho_enable) + return; + + scratch_size_update(); + + /* FIXME: deal with node hot-plug/remove */ + kho_scratch_cnt = num_online_nodes() + 2; + size = kho_scratch_cnt * sizeof(*kho_scratch); + kho_scratch = memblock_alloc(size, PAGE_SIZE); + if (!kho_scratch) + goto err_disable_kho; + + /* + * reserve scratch area in low memory for lowmem allocations in the + * next kernel + */ + size = scratch_size_lowmem; + addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0, + ARCH_LOW_ADDRESS_LIMIT); + if (!addr) + goto err_free_scratch_desc; + + kho_scratch[i].addr = addr; + kho_scratch[i].size = size; + i++; + + /* reserve large contiguous area for allocations without nid */ + size = scratch_size_global; + addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES); + if (!addr) + goto err_free_scratch_areas; + + kho_scratch[i].addr = addr; + kho_scratch[i].size = size; + i++; + + for_each_online_node(nid) { + size = scratch_size_node(nid); + addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES, + 0, MEMBLOCK_ALLOC_ACCESSIBLE, + nid, true); + if (!addr) + goto err_free_scratch_areas; + + kho_scratch[i].addr = addr; + kho_scratch[i].size = size; + i++; + } + + return; + +err_free_scratch_areas: + for (i--; i >= 0; i--) + memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size); +err_free_scratch_desc: + memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch)); +err_disable_kho: + kho_enable = false; +} + +struct fdt_debugfs { + struct list_head list; + struct debugfs_blob_wrapper wrapper; + struct dentry *file; +}; + +static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, + const char *name, const void *fdt) +{ + struct fdt_debugfs *f; + struct dentry *file; + + f = kmalloc(sizeof(*f), GFP_KERNEL); + if (!f) + return -ENOMEM; + + f->wrapper.data = (void *)fdt; + f->wrapper.size = fdt_totalsize(fdt); + + file = debugfs_create_blob(name, 0400, dir, &f->wrapper); + if (IS_ERR(file)) { + kfree(f); + return PTR_ERR(file); + } + + f->file = file; + list_add(&f->list, list); + + return 0; +} + +/** + * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. + * @ser: serialization control object passed by KHO notifiers. + * @name: name of the sub tree. + * @fdt: the sub tree blob. + * + * Creates a new child node named @name in KHO root FDT and records + * the physical address of @fdt. The pages of @fdt must also be preserved + * by KHO for the new kernel to retrieve it after kexec. + * + * A debugfs blob entry is also created at + * ``/sys/kernel/debug/kho/out/sub_fdts/@name``. + * + * Return: 0 on success, error code on failure + */ +int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt) +{ + int err = 0; + u64 phys = (u64)virt_to_phys(fdt); + void *root = page_to_virt(ser->fdt); + + err |= fdt_begin_node(root, name); + err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys)); + err |= fdt_end_node(root); + + if (err) + return err; + + return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt); +} +EXPORT_SYMBOL_GPL(kho_add_subtree); + +struct kho_out { + struct blocking_notifier_head chain_head; + + struct dentry *dir; + + struct mutex lock; /* protects KHO FDT finalization */ + + struct kho_serialization ser; + bool finalized; +}; + +static struct kho_out kho_out = { + .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head), + .lock = __MUTEX_INITIALIZER(kho_out.lock), + .ser = { + .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list), + .track = { + .orders = XARRAY_INIT(kho_out.ser.track.orders, 0), + }, + }, + .finalized = false, +}; + +int register_kho_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&kho_out.chain_head, nb); +} +EXPORT_SYMBOL_GPL(register_kho_notifier); + +int unregister_kho_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&kho_out.chain_head, nb); +} +EXPORT_SYMBOL_GPL(unregister_kho_notifier); + +/* Handling for debug/kho/out */ + +static struct dentry *debugfs_root; + +static int kho_out_update_debugfs_fdt(void) +{ + int err = 0; + struct fdt_debugfs *ff, *tmp; + + if (kho_out.finalized) { + err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir, + "fdt", page_to_virt(kho_out.ser.fdt)); + } else { + list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) { + debugfs_remove(ff->file); + list_del(&ff->list); + kfree(ff); + } + } + + return err; +} + +static int kho_abort(void) +{ + int err; + unsigned long order; + struct kho_mem_phys *physxa; + + xa_for_each(&kho_out.ser.track.orders, order, physxa) { + struct kho_mem_phys_bits *bits; + unsigned long phys; + + xa_for_each(&physxa->phys_bits, phys, bits) + kfree(bits); + + xa_destroy(&physxa->phys_bits); + kfree(physxa); + } + xa_destroy(&kho_out.ser.track.orders); + + if (kho_out.ser.preserved_mem_map) { + kho_mem_ser_free(kho_out.ser.preserved_mem_map); + kho_out.ser.preserved_mem_map = NULL; + } + + err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT, + NULL); + err = notifier_to_errno(err); + + if (err) + pr_err("Failed to abort KHO finalization: %d\n", err); + + return err; +} + +static int kho_finalize(void) +{ + int err = 0; + u64 *preserved_mem_map; + void *fdt = page_to_virt(kho_out.ser.fdt); + + err |= fdt_create(fdt, PAGE_SIZE); + err |= fdt_finish_reservemap(fdt); + err |= fdt_begin_node(fdt, ""); + err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE); + /** + * Reserve the preserved-memory-map property in the root FDT, so + * that all property definitions will precede subnodes created by + * KHO callers. + */ + err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP, + sizeof(*preserved_mem_map), + (void **)&preserved_mem_map); + if (err) + goto abort; + + err = kho_preserve_folio(&kho_out.ser, page_folio(kho_out.ser.fdt)); + if (err) + goto abort; + + err = blocking_notifier_call_chain(&kho_out.chain_head, + KEXEC_KHO_FINALIZE, &kho_out.ser); + err = notifier_to_errno(err); + if (err) + goto abort; + + err = kho_mem_serialize(&kho_out.ser); + if (err) + goto abort; + + *preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map); + + err |= fdt_end_node(fdt); + err |= fdt_finish(fdt); + +abort: + if (err) { + pr_err("Failed to convert KHO state tree: %d\n", err); + kho_abort(); + } + + return err; +} + +static int kho_out_finalize_get(void *data, u64 *val) +{ + mutex_lock(&kho_out.lock); + *val = kho_out.finalized; + mutex_unlock(&kho_out.lock); + + return 0; +} + +static int kho_out_finalize_set(void *data, u64 _val) +{ + int ret = 0; + bool val = !!_val; + + mutex_lock(&kho_out.lock); + + if (val == kho_out.finalized) { + if (kho_out.finalized) + ret = -EEXIST; + else + ret = -ENOENT; + goto unlock; + } + + if (val) + ret = kho_finalize(); + else + ret = kho_abort(); + + if (ret) + goto unlock; + + kho_out.finalized = val; + ret = kho_out_update_debugfs_fdt(); + +unlock: + mutex_unlock(&kho_out.lock); + return ret; +} + +DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get, + kho_out_finalize_set, "%llu\n"); + +static int scratch_phys_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].addr); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_phys); + +static int scratch_len_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].size); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_len); + +static __init int kho_out_debugfs_init(void) +{ + struct dentry *dir, *f, *sub_fdt_dir; + + dir = debugfs_create_dir("out", debugfs_root); + if (IS_ERR(dir)) + return -ENOMEM; + + sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); + if (IS_ERR(sub_fdt_dir)) + goto err_rmdir; + + f = debugfs_create_file("scratch_phys", 0400, dir, NULL, + &scratch_phys_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("scratch_len", 0400, dir, NULL, + &scratch_len_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("finalize", 0600, dir, NULL, + &fops_kho_out_finalize); + if (IS_ERR(f)) + goto err_rmdir; + + kho_out.dir = dir; + kho_out.ser.sub_fdt_dir = sub_fdt_dir; + return 0; + +err_rmdir: + debugfs_remove_recursive(dir); + return -ENOENT; +} + +struct kho_in { + struct dentry *dir; + phys_addr_t fdt_phys; + phys_addr_t scratch_phys; + struct list_head fdt_list; +}; + +static struct kho_in kho_in = { + .fdt_list = LIST_HEAD_INIT(kho_in.fdt_list), +}; + +static const void *kho_get_fdt(void) +{ + return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL; +} + +/** + * kho_retrieve_subtree - retrieve a preserved sub FDT by its name. + * @name: the name of the sub FDT passed to kho_add_subtree(). + * @phys: if found, the physical address of the sub FDT is stored in @phys. + * + * Retrieve a preserved sub FDT named @name and store its physical + * address in @phys. + * + * Return: 0 on success, error code on failure + */ +int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +{ + const void *fdt = kho_get_fdt(); + const u64 *val; + int offset, len; + + if (!fdt) + return -ENOENT; + + if (!phys) + return -EINVAL; + + offset = fdt_subnode_offset(fdt, 0, name); + if (offset < 0) + return -ENOENT; + + val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len); + if (!val || len != sizeof(*val)) + return -EINVAL; + + *phys = (phys_addr_t)*val; + + return 0; +} +EXPORT_SYMBOL_GPL(kho_retrieve_subtree); + +/* Handling for debugfs/kho/in */ + +static __init int kho_in_debugfs_init(const void *fdt) +{ + struct dentry *sub_fdt_dir; + int err, child; + + kho_in.dir = debugfs_create_dir("in", debugfs_root); + if (IS_ERR(kho_in.dir)) + return PTR_ERR(kho_in.dir); + + sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir); + if (IS_ERR(sub_fdt_dir)) { + err = PTR_ERR(sub_fdt_dir); + goto err_rmdir; + } + + err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt); + if (err) + goto err_rmdir; + + fdt_for_each_subnode(child, fdt, 0) { + int len = 0; + const char *name = fdt_get_name(fdt, child, NULL); + const u64 *fdt_phys; + + fdt_phys = fdt_getprop(fdt, child, "fdt", &len); + if (!fdt_phys) + continue; + if (len != sizeof(*fdt_phys)) { + pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n", + name, len); + continue; + } + err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name, + phys_to_virt(*fdt_phys)); + if (err) { + pr_warn("failed to add fdt `%s` to debugfs: %d\n", name, + err); + continue; + } + } + + return 0; + +err_rmdir: + debugfs_remove_recursive(kho_in.dir); + return err; +} + +static __init int kho_init(void) +{ + int err = 0; + const void *fdt = kho_get_fdt(); + + if (!kho_enable) + return 0; + + kho_out.ser.fdt = alloc_page(GFP_KERNEL); + if (!kho_out.ser.fdt) { + err = -ENOMEM; + goto err_free_scratch; + } + + debugfs_root = debugfs_create_dir("kho", NULL); + if (IS_ERR(debugfs_root)) { + err = -ENOENT; + goto err_free_fdt; + } + + err = kho_out_debugfs_init(); + if (err) + goto err_free_fdt; + + if (fdt) { + err = kho_in_debugfs_init(fdt); + /* + * Failure to create /sys/kernel/debug/kho/in does not prevent + * reviving state from KHO and setting up KHO for the next + * kexec. + */ + if (err) + pr_err("failed exposing handover FDT in debugfs: %d\n", + err); + + return 0; + } + + for (int i = 0; i < kho_scratch_cnt; i++) { + unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr); + unsigned long count = kho_scratch[i].size >> PAGE_SHIFT; + unsigned long pfn; + + for (pfn = base_pfn; pfn < base_pfn + count; + pfn += pageblock_nr_pages) + init_cma_reserved_pageblock(pfn_to_page(pfn)); + } + + return 0; + +err_free_fdt: + put_page(kho_out.ser.fdt); + kho_out.ser.fdt = NULL; +err_free_scratch: + for (int i = 0; i < kho_scratch_cnt; i++) { + void *start = __va(kho_scratch[i].addr); + void *end = start + kho_scratch[i].size; + + free_reserved_area(start, end, -1, ""); + } + kho_enable = false; + return err; +} +late_initcall(kho_init); + +static void __init kho_release_scratch(void) +{ + phys_addr_t start, end; + u64 i; + + memmap_init_kho_scratch_pages(); + + /* + * Mark scratch mem as CMA before we return it. That way we + * ensure that no kernel allocations happen on it. That means + * we can reuse it as scratch memory again later. + */ + __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, + MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) { + ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start)); + ulong end_pfn = pageblock_align(PFN_UP(end)); + ulong pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) + set_pageblock_migratetype(pfn_to_page(pfn), + MIGRATE_CMA); + } +} + +void __init kho_memory_init(void) +{ + if (kho_in.scratch_phys) { + kho_scratch = phys_to_virt(kho_in.scratch_phys); + kho_release_scratch(); + + kho_mem_deserialize(kho_get_fdt()); + } else { + kho_reserve_scratch(); + } +} + +void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, + phys_addr_t scratch_phys, u64 scratch_len) +{ + void *fdt = NULL; + struct kho_scratch *scratch = NULL; + int err = 0; + unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch); + + /* Validate the input FDT */ + fdt = early_memremap(fdt_phys, fdt_len); + if (!fdt) { + pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys); + err = -EFAULT; + goto out; + } + err = fdt_check_header(fdt); + if (err) { + pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n", + fdt_phys, err); + err = -EINVAL; + goto out; + } + err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE); + if (err) { + pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n", + fdt_phys, KHO_FDT_COMPATIBLE, err); + err = -EINVAL; + goto out; + } + + scratch = early_memremap(scratch_phys, scratch_len); + if (!scratch) { + pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n", + scratch_phys, scratch_len); + err = -EFAULT; + goto out; + } + + /* + * We pass a safe contiguous blocks of memory to use for early boot + * purporses from the previous kernel so that we can resize the + * memblock array as needed. + */ + for (int i = 0; i < scratch_cnt; i++) { + struct kho_scratch *area = &scratch[i]; + u64 size = area->size; + + memblock_add(area->addr, size); + err = memblock_mark_kho_scratch(area->addr, size); + if (WARN_ON(err)) { + pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d", + &area->addr, &size, err); + goto out; + } + pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size); + } + + memblock_reserve(scratch_phys, scratch_len); + + /* + * Now that we have a viable region of scratch memory, let's tell + * the memblocks allocator to only use that for any allocations. + * That way we ensure that nothing scribbles over in use data while + * we initialize the page tables which we will need to ingest all + * memory reservations from the previous kernel. + */ + memblock_set_kho_scratch_only(); + + kho_in.fdt_phys = fdt_phys; + kho_in.scratch_phys = scratch_phys; + kho_scratch_cnt = scratch_cnt; + pr_info("found kexec handover data. Will skip init for some devices\n"); + +out: + if (fdt) + early_memunmap(fdt, fdt_len); + if (scratch) + early_memunmap(scratch, scratch_len); + if (err) + pr_warn("disabling KHO revival: %d\n", err); +} + +/* Helper functions for kexec_file_load */ + +int kho_fill_kimage(struct kimage *image) +{ + ssize_t scratch_size; + int err = 0; + struct kexec_buf scratch; + + if (!kho_enable) + return 0; + + image->kho.fdt = page_to_phys(kho_out.ser.fdt); + + scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt; + scratch = (struct kexec_buf){ + .image = image, + .buffer = kho_scratch, + .bufsz = scratch_size, + .mem = KEXEC_BUF_MEM_UNKNOWN, + .memsz = scratch_size, + .buf_align = SZ_64K, /* Makes it easier to map */ + .buf_max = ULONG_MAX, + .top_down = true, + }; + err = kexec_add_buffer(&scratch); + if (err) + return err; + image->kho.scratch = &image->segment[image->nr_segments - 1]; + + return 0; +} + +static int kho_walk_scratch(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + int ret = 0; + int i; + + for (i = 0; i < kho_scratch_cnt; i++) { + struct resource res = { + .start = kho_scratch[i].addr, + .end = kho_scratch[i].addr + kho_scratch[i].size - 1, + }; + + /* Try to fit the kimage into our KHO scratch region */ + ret = func(&res, kbuf); + if (ret) + break; + } + + return ret; +} + +int kho_locate_mem_hole(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + int ret; + + if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH) + return 1; + + ret = kho_walk_scratch(kbuf, func); + + return ret == 1 ? 0 : -EADDRNOTAVAIL; +} diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index d35d9792402d1..30a733a55a67b 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -39,4 +39,20 @@ extern size_t kexec_purgatory_size; #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } #endif /* CONFIG_KEXEC_FILE */ + +struct kexec_buf; + +#ifdef CONFIG_KEXEC_HANDOVER +int kho_locate_mem_hole(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)); +int kho_fill_kimage(struct kimage *image); +#else +static inline int kho_locate_mem_hole(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 1; +} + +static inline int kho_fill_kimage(struct kimage *image) { return 0; } +#endif /* CONFIG_KEXEC_HANDOVER */ #endif /* LINUX_KEXEC_INTERNAL_H */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d5f89f9ef29f6..75a84efad40f1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -921,7 +921,6 @@ ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, unsigned long args[ARRAY_SIZE(info->entry.args)]; int i; - info->op = PTRACE_SYSCALL_INFO_ENTRY; info->entry.nr = syscall_get_nr(child, regs); syscall_get_arguments(child, regs, args); for (i = 0; i < ARRAY_SIZE(args); i++) @@ -943,10 +942,12 @@ ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, * diverge significantly enough. */ ptrace_get_syscall_info_entry(child, regs, info); - info->op = PTRACE_SYSCALL_INFO_SECCOMP; info->seccomp.ret_data = child->ptrace_message; - /* ret_data is the last field in struct ptrace_syscall_info.seccomp */ + /* + * ret_data is the last non-reserved field + * in struct ptrace_syscall_info.seccomp + */ return offsetofend(struct ptrace_syscall_info, seccomp.ret_data); } @@ -954,7 +955,6 @@ static unsigned long ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, struct ptrace_syscall_info *info) { - info->op = PTRACE_SYSCALL_INFO_EXIT; info->exit.rval = syscall_get_error(child, regs); info->exit.is_error = !!info->exit.rval; if (!info->exit.is_error) @@ -965,19 +965,8 @@ ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, } static int -ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, - void __user *datavp) +ptrace_get_syscall_info_op(struct task_struct *child) { - struct pt_regs *regs = task_pt_regs(child); - struct ptrace_syscall_info info = { - .op = PTRACE_SYSCALL_INFO_NONE, - .arch = syscall_get_arch(child), - .instruction_pointer = instruction_pointer(regs), - .stack_pointer = user_stack_pointer(regs), - }; - unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); - unsigned long write_size; - /* * This does not need lock_task_sighand() to access * child->last_siginfo because ptrace_freeze_traced() @@ -988,24 +977,160 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, case SIGTRAP | 0x80: switch (child->ptrace_message) { case PTRACE_EVENTMSG_SYSCALL_ENTRY: - actual_size = ptrace_get_syscall_info_entry(child, regs, - &info); - break; + return PTRACE_SYSCALL_INFO_ENTRY; case PTRACE_EVENTMSG_SYSCALL_EXIT: - actual_size = ptrace_get_syscall_info_exit(child, regs, - &info); - break; + return PTRACE_SYSCALL_INFO_EXIT; + default: + return PTRACE_SYSCALL_INFO_NONE; } - break; case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8): - actual_size = ptrace_get_syscall_info_seccomp(child, regs, - &info); + return PTRACE_SYSCALL_INFO_SECCOMP; + default: + return PTRACE_SYSCALL_INFO_NONE; + } +} + +static int +ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, + void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info = { + .op = ptrace_get_syscall_info_op(child), + .arch = syscall_get_arch(child), + .instruction_pointer = instruction_pointer(regs), + .stack_pointer = user_stack_pointer(regs), + }; + unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); + unsigned long write_size; + + switch (info.op) { + case PTRACE_SYSCALL_INFO_ENTRY: + actual_size = ptrace_get_syscall_info_entry(child, regs, &info); + break; + case PTRACE_SYSCALL_INFO_EXIT: + actual_size = ptrace_get_syscall_info_exit(child, regs, &info); + break; + case PTRACE_SYSCALL_INFO_SECCOMP: + actual_size = ptrace_get_syscall_info_seccomp(child, regs, &info); break; } write_size = min(actual_size, user_size); return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size; } + +static int +ptrace_set_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + unsigned long args[ARRAY_SIZE(info->entry.args)]; + int nr = info->entry.nr; + int i; + + /* + * Check that the syscall number specified in info->entry.nr + * is either a value of type "int" or a sign-extended value + * of type "int". + */ + if (nr != info->entry.nr) + return -ERANGE; + + for (i = 0; i < ARRAY_SIZE(args); i++) { + args[i] = info->entry.args[i]; + /* + * Check that the syscall argument specified in + * info->entry.args[i] is either a value of type + * "unsigned long" or a sign-extended value of type "long". + */ + if (args[i] != info->entry.args[i]) + return -ERANGE; + } + + syscall_set_nr(child, regs, nr); + /* + * If the syscall number is set to -1, setting syscall arguments is not + * just pointless, it would also clobber the syscall return value on + * those architectures that share the same register both for the first + * argument of syscall and its return value. + */ + if (nr != -1) + syscall_set_arguments(child, regs, args); + + return 0; +} + +static int +ptrace_set_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + /* + * info->entry is currently a subset of info->seccomp, + * info->seccomp.ret_data is currently ignored. + */ + return ptrace_set_syscall_info_entry(child, regs, info); +} + +static int +ptrace_set_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + long rval = info->exit.rval; + + /* + * Check that the return value specified in info->exit.rval + * is either a value of type "long" or a sign-extended value + * of type "long". + */ + if (rval != info->exit.rval) + return -ERANGE; + + if (info->exit.is_error) + syscall_set_return_value(child, regs, rval, 0); + else + syscall_set_return_value(child, regs, 0, rval); + + return 0; +} + +static int +ptrace_set_syscall_info(struct task_struct *child, unsigned long user_size, + const void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info; + + if (user_size < sizeof(info)) + return -EINVAL; + + /* + * The compatibility is tracked by info.op and info.flags: if user-space + * does not instruct us to use unknown extra bits from future versions + * of ptrace_syscall_info, we are not going to read them either. + */ + if (copy_from_user(&info, datavp, sizeof(info))) + return -EFAULT; + + /* Reserved for future use. */ + if (info.flags || info.reserved) + return -EINVAL; + + /* Changing the type of the system call stop is not supported yet. */ + if (ptrace_get_syscall_info_op(child) != info.op) + return -EINVAL; + + switch (info.op) { + case PTRACE_SYSCALL_INFO_ENTRY: + return ptrace_set_syscall_info_entry(child, regs, &info); + case PTRACE_SYSCALL_INFO_EXIT: + return ptrace_set_syscall_info_exit(child, regs, &info); + case PTRACE_SYSCALL_INFO_SECCOMP: + return ptrace_set_syscall_info_seccomp(child, regs, &info); + default: + /* Other types of system call stops are not supported yet. */ + return -EINVAL; + } +} #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ int ptrace_request(struct task_struct *child, long request, @@ -1224,6 +1349,10 @@ int ptrace_request(struct task_struct *child, long request, case PTRACE_GET_SYSCALL_INFO: ret = ptrace_get_syscall_info(child, addr, datavp); break; + + case PTRACE_SET_SYSCALL_INFO: + ret = ptrace_set_syscall_info(child, addr, datavp); + break; #endif case PTRACE_SECCOMP_GET_FILTER: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c81cf642dba05..7f191a3b0285b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3352,6 +3352,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) { + __schedstat_inc(p->stats.numa_task_swapped); + + if (p->mm) + count_memcg_events_mm(p->mm, NUMA_TASK_SWAP, 1); + if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; struct rq_flags srf, drf; @@ -7953,8 +7958,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu) if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) return -EINVAL; - /* TODO: This is not properly updating schedstats */ - + __schedstat_inc(p->stats.numa_task_migrated); + if (p->mm) + count_memcg_events_mm(p->mm, NUMA_TASK_MIGRATE, 1); trace_sched_move_numa(p, curr_cpu, target_cpu); return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 56ae54e0ce6a3..f971c2af79129 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1206,6 +1206,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(nr_failed_migrations_running); P_SCHEDSTAT(nr_failed_migrations_hot); P_SCHEDSTAT(nr_forced_migrations); +#ifdef CONFIG_NUMA_BALANCING + P_SCHEDSTAT(numa_task_migrated); + P_SCHEDSTAT(numa_task_swapped); +#endif P_SCHEDSTAT(nr_wakeups); P_SCHEDSTAT(nr_wakeups_sync); P_SCHEDSTAT(nr_wakeups_migrate); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 41aa761c7738c..7daf2da09e8e1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -741,6 +741,26 @@ seccomp_prepare_user_filter(const char __user *user_filter) } #ifdef SECCOMP_ARCH_NATIVE +static bool seccomp_uprobe_exception(struct seccomp_data *sd) +{ +#if defined __NR_uretprobe || defined __NR_uprobe +#ifdef SECCOMP_ARCH_COMPAT + if (sd->arch == SECCOMP_ARCH_NATIVE) +#endif + { +#ifdef __NR_uretprobe + if (sd->nr == __NR_uretprobe) + return true; +#endif +#ifdef __NR_uprobe + if (sd->nr == __NR_uprobe) + return true; +#endif + } +#endif + return false; +} + /** * seccomp_is_const_allow - check if filter is constant allow with given data * @fprog: The BPF programs @@ -758,13 +778,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, return false; /* Our single exception to filtering. */ -#ifdef __NR_uretprobe -#ifdef SECCOMP_ARCH_COMPAT - if (sd->arch == SECCOMP_ARCH_NATIVE) -#endif - if (sd->nr == __NR_uretprobe) - return true; -#endif + if (seccomp_uprobe_exception(sd)) + return true; for (pc = 0; pc < fprog->len; pc++) { struct sock_filter *insn = &fprog->filter[pc]; @@ -1042,6 +1057,9 @@ static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, #ifdef __NR_uretprobe __NR_uretprobe, +#endif +#ifdef __NR_uprobe + __NR_uprobe, #endif -1, /* negative terminated */ }; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c00a86931f8c6..bf5d05c635ffd 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -392,3 +392,4 @@ COND_SYSCALL(setuid16); COND_SYSCALL(rseq); COND_SYSCALL(uretprobe); +COND_SYSCALL(uprobe); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 187dc37d61d4a..0f5906f43d7ca 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2987,6 +2987,9 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; + if (attr->link_create.flags) + return -EINVAL; + if (!is_kprobe_multi(prog)) return -EINVAL; @@ -3376,6 +3379,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; + if (attr->link_create.flags) + return -EINVAL; + if (!is_uprobe_multi(prog)) return -EINVAL; diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 1d893e3136149..25ecc1334b67d 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -422,11 +422,20 @@ static int vm_module_tags_populate(void) unsigned long old_shadow_end = ALIGN(phys_end, MODULE_ALIGN); unsigned long new_shadow_end = ALIGN(new_end, MODULE_ALIGN); unsigned long more_pages; - unsigned long nr; + unsigned long nr = 0; more_pages = ALIGN(new_end - phys_end, PAGE_SIZE) >> PAGE_SHIFT; - nr = alloc_pages_bulk_node(GFP_KERNEL | __GFP_NOWARN, - NUMA_NO_NODE, more_pages, next_page); + while (nr < more_pages) { + unsigned long allocated; + + allocated = alloc_pages_bulk_node(GFP_KERNEL | __GFP_NOWARN, + NUMA_NO_NODE, more_pages - nr, next_page + nr); + + if (!allocated) + break; + nr += allocated; + } + if (nr < more_pages || vmap_pages_range(phys_end, phys_end + (nr << PAGE_SHIFT), PAGE_KERNEL, next_page, PAGE_SHIFT) < 0) { diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c index 13da529e2e724..5738ae286b41e 100644 --- a/lib/asn1_decoder.c +++ b/lib/asn1_decoder.c @@ -518,4 +518,5 @@ int asn1_ber_decoder(const struct asn1_decoder *decoder, } EXPORT_SYMBOL_GPL(asn1_ber_decoder); +MODULE_DESCRIPTION("Decoder for ASN.1 BER/DER/CER encoded bytestream"); MODULE_LICENSE("GPL"); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 8c7fdb7d8c8fa..d9e19fb2dcf35 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1059,22 +1059,22 @@ static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa pgoff_t index, unsigned int nr_pages) { XA_STATE(xas, xa, index); - struct page *page; + struct folio *folio; unsigned int ret = 0; rcu_read_lock(); - for (page = xas_load(&xas); page; page = xas_next(&xas)) { - if (xas_retry(&xas, page)) + for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { + if (xas_retry(&xas, folio)) continue; - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) { + /* Has the folio moved or been split? */ + if (unlikely(folio != xas_reload(&xas))) { xas_reset(&xas); continue; } - pages[ret] = find_subpage(page, xas.xa_index); - get_page(pages[ret]); + pages[ret] = folio_file_page(folio, xas.xa_index); + folio_get(folio); if (++ret == nr_pages) break; } @@ -1191,7 +1191,7 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, return -ENOMEM; p = *pages; for (int k = 0; k < n; k++) { - struct folio *folio = page_folio(page); + struct folio *folio = page_folio(page + k); p[k] = page + k; if (!folio_test_slab(folio)) folio_get(folio); @@ -1650,11 +1650,11 @@ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, iov_iter_extraction_t extraction_flags, size_t *offset0) { - struct page *page, **p; + struct page **p; + struct folio *folio; unsigned int nr = 0, offset; loff_t pos = i->xarray_start + i->iov_offset; - pgoff_t index = pos >> PAGE_SHIFT; - XA_STATE(xas, i->xarray, index); + XA_STATE(xas, i->xarray, pos >> PAGE_SHIFT); offset = pos & ~PAGE_MASK; *offset0 = offset; @@ -1665,17 +1665,17 @@ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, p = *pages; rcu_read_lock(); - for (page = xas_load(&xas); page; page = xas_next(&xas)) { - if (xas_retry(&xas, page)) + for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { + if (xas_retry(&xas, folio)) continue; - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) { + /* Has the folio moved or been split? */ + if (unlikely(folio != xas_reload(&xas))) { xas_reset(&xas); continue; } - p[nr++] = find_subpage(page, xas.xa_index); + p[nr++] = folio_file_page(folio, xas.xa_index); if (nr == maxpages) break; } diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d0bea23fa4bc9..affe979bd14d3 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -211,14 +211,14 @@ static void ma_free_rcu(struct maple_node *node) call_rcu(&node->rcu, mt_free_rcu); } -static void mas_set_height(struct ma_state *mas) +static void mt_set_height(struct maple_tree *mt, unsigned char height) { - unsigned int new_flags = mas->tree->ma_flags; + unsigned int new_flags = mt->ma_flags; new_flags &= ~MT_FLAGS_HEIGHT_MASK; - MAS_BUG_ON(mas, mas->depth > MAPLE_HEIGHT_MAX); - new_flags |= mas->depth << MT_FLAGS_HEIGHT_OFFSET; - mas->tree->ma_flags = new_flags; + MT_BUG_ON(mt, height > MAPLE_HEIGHT_MAX); + new_flags |= height << MT_FLAGS_HEIGHT_OFFSET; + mt->ma_flags = new_flags; } static unsigned int mas_mt_height(struct ma_state *mas) @@ -1371,7 +1371,7 @@ static inline struct maple_enode *mas_start(struct ma_state *mas) root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { - mas->depth = 1; + mas->depth = 0; mas->status = ma_active; mas->node = mte_safe_root(root); mas->offset = 0; @@ -1712,9 +1712,10 @@ static inline void mas_adopt_children(struct ma_state *mas, * node as dead. * @mas: the maple state with the new node * @old_enode: The old maple encoded node to replace. + * @new_height: if we are inserting a root node, update the height of the tree */ static inline void mas_put_in_tree(struct ma_state *mas, - struct maple_enode *old_enode) + struct maple_enode *old_enode, char new_height) __must_hold(mas->tree->ma_lock) { unsigned char offset; @@ -1723,7 +1724,7 @@ static inline void mas_put_in_tree(struct ma_state *mas, if (mte_is_root(mas->node)) { mas_mn(mas)->parent = ma_parent_ptr(mas_tree_parent(mas)); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); - mas_set_height(mas); + mt_set_height(mas->tree, new_height); } else { offset = mte_parent_slot(mas->node); @@ -1741,12 +1742,13 @@ static inline void mas_put_in_tree(struct ma_state *mas, * the parent encoding to locate the maple node in the tree. * @mas: the ma_state with @mas->node pointing to the new node. * @old_enode: The old maple encoded node. + * @new_height: The new height of the tree as a result of the operation */ static inline void mas_replace_node(struct ma_state *mas, - struct maple_enode *old_enode) + struct maple_enode *old_enode, unsigned char new_height) __must_hold(mas->tree->ma_lock) { - mas_put_in_tree(mas, old_enode); + mas_put_in_tree(mas, old_enode, new_height); mas_free(mas, old_enode); } @@ -2536,10 +2538,11 @@ static inline void mas_topiary_node(struct ma_state *mas, * * @mas: The maple state pointing at the new data * @old_enode: The maple encoded node being replaced + * @new_height: The new height of the tree as a result of the operation * */ static inline void mas_topiary_replace(struct ma_state *mas, - struct maple_enode *old_enode) + struct maple_enode *old_enode, unsigned char new_height) { struct ma_state tmp[3], tmp_next[3]; MA_TOPIARY(subtrees, mas->tree); @@ -2547,7 +2550,7 @@ static inline void mas_topiary_replace(struct ma_state *mas, int i, n; /* Place data in tree & then mark node as old */ - mas_put_in_tree(mas, old_enode); + mas_put_in_tree(mas, old_enode, new_height); /* Update the parent pointers in the tree */ tmp[0] = *mas; @@ -2631,14 +2634,15 @@ static inline void mas_topiary_replace(struct ma_state *mas, * mas_wmb_replace() - Write memory barrier and replace * @mas: The maple state * @old_enode: The old maple encoded node that is being replaced. + * @new_height: The new height of the tree as a result of the operation * * Updates gap as necessary. */ static inline void mas_wmb_replace(struct ma_state *mas, - struct maple_enode *old_enode) + struct maple_enode *old_enode, unsigned char new_height) { /* Insert the new data in the tree */ - mas_topiary_replace(mas, old_enode); + mas_topiary_replace(mas, old_enode, new_height); if (mte_is_leaf(mas->node)) return; @@ -2737,7 +2741,7 @@ static inline bool mast_sufficient(struct maple_subtree_state *mast) */ static inline bool mast_overflow(struct maple_subtree_state *mast) { - if (mast->bn->b_end >= mt_slot_count(mast->orig_l->node)) + if (mast->bn->b_end > mt_slot_count(mast->orig_l->node)) return true; return false; @@ -2824,6 +2828,7 @@ static void mas_spanning_rebalance(struct ma_state *mas, { unsigned char split, mid_split; unsigned char slot = 0; + unsigned char new_height = 0; /* used if node is a new root */ struct maple_enode *left = NULL, *middle = NULL, *right = NULL; struct maple_enode *old_enode; @@ -2845,8 +2850,6 @@ static void mas_spanning_rebalance(struct ma_state *mas, unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) mast_spanning_rebalance(mast); - l_mas.depth = 0; - /* * Each level of the tree is examined and balanced, pushing data to the left or * right, or rebalancing against left or right nodes is employed to avoid @@ -2866,6 +2869,7 @@ static void mas_spanning_rebalance(struct ma_state *mas, mast_set_split_parents(mast, left, middle, right, split, mid_split); mast_cp_to_nodes(mast, left, middle, right, split, mid_split); + new_height++; /* * Copy data from next level in the tree to mast->bn from next @@ -2873,7 +2877,6 @@ static void mas_spanning_rebalance(struct ma_state *mas, */ memset(mast->bn, 0, sizeof(struct maple_big_node)); mast->bn->type = mte_node_type(left); - l_mas.depth++; /* Root already stored in l->node. */ if (mas_is_root_limits(mast->l)) @@ -2890,11 +2893,21 @@ static void mas_spanning_rebalance(struct ma_state *mas, mast_combine_cp_right(mast); mast->orig_l->last = mast->orig_l->max; - if (mast_sufficient(mast)) - continue; + if (mast_sufficient(mast)) { + if (mast_overflow(mast)) + continue; + + if (mast->orig_l->node == mast->orig_r->node) { + /* + * The data in b_node should be stored in one + * node and in the tree + */ + slot = mast->l->offset; + break; + } - if (mast_overflow(mast)) continue; + } /* May be a new root stored in mast->bn */ if (mas_is_root_limits(mast->orig_l)) @@ -2909,8 +2922,9 @@ static void mas_spanning_rebalance(struct ma_state *mas, l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), mte_node_type(mast->orig_l->node)); - l_mas.depth++; + mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true); + new_height++; mas_set_parent(mas, left, l_mas.node, slot); if (middle) mas_set_parent(mas, middle, l_mas.node, ++slot); @@ -2933,7 +2947,7 @@ static void mas_spanning_rebalance(struct ma_state *mas, mas->min = l_mas.min; mas->max = l_mas.max; mas->offset = l_mas.offset; - mas_wmb_replace(mas, old_enode); + mas_wmb_replace(mas, old_enode, new_height); mtree_range_walk(mas); return; } @@ -3009,6 +3023,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end void __rcu **l_slots, **slots; unsigned long *l_pivs, *pivs, gap; bool in_rcu = mt_in_rcu(mas->tree); + unsigned char new_height = mas_mt_height(mas); MA_STATE(l_mas, mas->tree, mas->index, mas->last); @@ -3103,7 +3118,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end mas_ascend(mas); if (in_rcu) { - mas_replace_node(mas, old_eparent); + mas_replace_node(mas, old_eparent, new_height); mas_adopt_children(mas, mas->node); } @@ -3114,10 +3129,9 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end * mas_split_final_node() - Split the final node in a subtree operation. * @mast: the maple subtree state * @mas: The maple state - * @height: The height of the tree in case it's a new root. */ static inline void mas_split_final_node(struct maple_subtree_state *mast, - struct ma_state *mas, int height) + struct ma_state *mas) { struct maple_enode *ancestor; @@ -3126,7 +3140,6 @@ static inline void mas_split_final_node(struct maple_subtree_state *mast, mast->bn->type = maple_arange_64; else mast->bn->type = maple_range_64; - mas->depth = height; } /* * Only a single node is used here, could be root. @@ -3214,7 +3227,6 @@ static inline void mast_split_data(struct maple_subtree_state *mast, * mas_push_data() - Instead of splitting a node, it is beneficial to push the * data to the right or left node if there is room. * @mas: The maple state - * @height: The current height of the maple state * @mast: The maple subtree state * @left: Push left or not. * @@ -3222,8 +3234,8 @@ static inline void mast_split_data(struct maple_subtree_state *mast, * * Return: True if pushed, false otherwise. */ -static inline bool mas_push_data(struct ma_state *mas, int height, - struct maple_subtree_state *mast, bool left) +static inline bool mas_push_data(struct ma_state *mas, + struct maple_subtree_state *mast, bool left) { unsigned char slot_total = mast->bn->b_end; unsigned char end, space, split; @@ -3280,7 +3292,7 @@ static inline bool mas_push_data(struct ma_state *mas, int height, mast_split_data(mast, mas, split); mast_fill_bnode(mast, mas, 2); - mas_split_final_node(mast, mas, height + 1); + mas_split_final_node(mast, mas); return true; } @@ -3293,6 +3305,7 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) { struct maple_subtree_state mast; int height = 0; + unsigned int orig_height = mas_mt_height(mas); unsigned char mid_split, split = 0; struct maple_enode *old; @@ -3319,7 +3332,6 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last); trace_ma_op(__func__, mas); - mas->depth = mas_mt_height(mas); mast.l = &l_mas; mast.r = &r_mas; @@ -3327,9 +3339,9 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) mast.orig_r = &prev_r_mas; mast.bn = b_node; - while (height++ <= mas->depth) { + while (height++ <= orig_height) { if (mt_slots[b_node->type] > b_node->b_end) { - mas_split_final_node(&mast, mas, height); + mas_split_final_node(&mast, mas); break; } @@ -3344,11 +3356,15 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) * is a significant savings. */ /* Try to push left. */ - if (mas_push_data(mas, height, &mast, true)) + if (mas_push_data(mas, &mast, true)) { + height++; break; + } /* Try to push right. */ - if (mas_push_data(mas, height, &mast, false)) + if (mas_push_data(mas, &mast, false)) { + height++; break; + } split = mab_calc_split(mas, b_node, &mid_split); mast_split_data(&mast, mas, split); @@ -3365,7 +3381,7 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) /* Set the original node as dead */ old = mas->node; mas->node = l_mas.node; - mas_wmb_replace(mas, old); + mas_wmb_replace(mas, old, height); mtree_range_walk(mas); return; } @@ -3424,8 +3440,7 @@ static inline void mas_root_expand(struct ma_state *mas, void *entry) if (mas->last != ULONG_MAX) pivots[++slot] = ULONG_MAX; - mas->depth = 1; - mas_set_height(mas); + mt_set_height(mas->tree, 1); ma_set_meta(node, maple_leaf_64, 0, slot); /* swap the new root into the tree */ rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); @@ -3532,6 +3547,16 @@ static bool mas_wr_walk(struct ma_wr_state *wr_mas) if (ma_is_leaf(wr_mas->type)) return true; + if (mas->end < mt_slots[wr_mas->type] - 1) + wr_mas->vacant_height = mas->depth + 1; + + if (ma_is_root(mas_mn(mas))) { + /* root needs more than 2 entries to be sufficient + 1 */ + if (mas->end > 2) + wr_mas->sufficient_height = 1; + } else if (mas->end > mt_min_slots[wr_mas->type] + 1) + wr_mas->sufficient_height = mas->depth + 1; + mas_wr_walk_traverse(wr_mas); } @@ -3669,8 +3694,7 @@ static inline void mas_new_root(struct ma_state *mas, void *entry) WARN_ON_ONCE(mas->index || mas->last != ULONG_MAX); if (!entry) { - mas->depth = 0; - mas_set_height(mas); + mt_set_height(mas->tree, 0); rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; goto done; @@ -3684,8 +3708,7 @@ static inline void mas_new_root(struct ma_state *mas, void *entry) mas->status = ma_active; rcu_assign_pointer(slots[0], entry); pivots[0] = mas->last; - mas->depth = 1; - mas_set_height(mas); + mt_set_height(mas->tree, 1); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); done: @@ -3804,6 +3827,7 @@ static inline void mas_wr_node_store(struct ma_wr_state *wr_mas, struct maple_node reuse, *newnode; unsigned char copy_size, node_pivots = mt_pivots[wr_mas->type]; bool in_rcu = mt_in_rcu(mas->tree); + unsigned char height = mas_mt_height(mas); if (mas->last == wr_mas->end_piv) offset_end++; /* don't copy this offset */ @@ -3860,7 +3884,7 @@ static inline void mas_wr_node_store(struct ma_wr_state *wr_mas, struct maple_enode *old_enode = mas->node; mas->node = mt_mk_node(newnode, wr_mas->type); - mas_replace_node(mas, old_enode); + mas_replace_node(mas, old_enode, height); } else { memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); } @@ -4059,15 +4083,6 @@ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) unsigned char new_end = mas_wr_new_end(wr_mas); switch (mas->store_type) { - case wr_invalid: - MT_BUG_ON(mas->tree, 1); - return; - case wr_new_root: - mas_new_root(mas, wr_mas->entry); - break; - case wr_store_root: - mas_store_root(mas, wr_mas->entry); - break; case wr_exact_fit: rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry); if (!!wr_mas->entry ^ !!wr_mas->content) @@ -4089,6 +4104,14 @@ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) case wr_rebalance: mas_wr_bnode(wr_mas); break; + case wr_new_root: + mas_new_root(mas, wr_mas->entry); + break; + case wr_store_root: + mas_store_root(mas, wr_mas->entry); + break; + case wr_invalid: + MT_BUG_ON(mas->tree, 1); } return; @@ -4140,18 +4163,41 @@ static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) /** * mas_prealloc_calc() - Calculate number of nodes needed for a * given store oepration - * @mas: The maple state + * @wr_mas: The maple write state * @entry: The entry to store into the tree * * Return: Number of nodes required for preallocation. */ -static inline int mas_prealloc_calc(struct ma_state *mas, void *entry) +static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry) { - int ret = mas_mt_height(mas) * 3 + 1; + struct ma_state *mas = wr_mas->mas; + unsigned char height = mas_mt_height(mas); + int ret = height * 3 + 1; + unsigned char delta = height - wr_mas->vacant_height; switch (mas->store_type) { - case wr_invalid: - WARN_ON_ONCE(1); + case wr_exact_fit: + case wr_append: + case wr_slot_store: + ret = 0; + break; + case wr_spanning_store: + if (wr_mas->sufficient_height < wr_mas->vacant_height) + ret = (height - wr_mas->sufficient_height) * 3 + 1; + else + ret = delta * 3 + 1; + break; + case wr_split_store: + ret = delta * 2 + 1; + break; + case wr_rebalance: + if (wr_mas->sufficient_height < wr_mas->vacant_height) + ret = (height - wr_mas->sufficient_height) * 2 + 1; + else + ret = delta * 2 + 1; + break; + case wr_node_store: + ret = mt_in_rcu(mas->tree) ? 1 : 0; break; case wr_new_root: ret = 1; @@ -4164,22 +4210,8 @@ static inline int mas_prealloc_calc(struct ma_state *mas, void *entry) else ret = 0; break; - case wr_spanning_store: - ret = mas_mt_height(mas) * 3 + 1; - break; - case wr_split_store: - ret = mas_mt_height(mas) * 2 + 1; - break; - case wr_rebalance: - ret = mas_mt_height(mas) * 2 - 1; - break; - case wr_node_store: - ret = mt_in_rcu(mas->tree) ? 1 : 0; - break; - case wr_append: - case wr_exact_fit: - case wr_slot_store: - ret = 0; + case wr_invalid: + WARN_ON_ONCE(1); } return ret; @@ -4243,16 +4275,15 @@ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) */ static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) { - struct ma_state *mas = wr_mas->mas; int request; mas_wr_prealloc_setup(wr_mas); - mas->store_type = mas_wr_store_type(wr_mas); - request = mas_prealloc_calc(mas, entry); + wr_mas->mas->store_type = mas_wr_store_type(wr_mas); + request = mas_prealloc_calc(wr_mas, entry); if (!request) return; - mas_node_count(mas, request); + mas_node_count(wr_mas->mas, request); } /** @@ -5397,7 +5428,7 @@ void *mas_store(struct ma_state *mas, void *entry) return wr_mas.content; } - request = mas_prealloc_calc(mas, entry); + request = mas_prealloc_calc(&wr_mas, entry); if (!request) goto store; @@ -5494,7 +5525,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); - request = mas_prealloc_calc(mas, entry); + request = mas_prealloc_calc(&wr_mas, entry); if (!request) return ret; diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 080a39d22e736..5ca0aefee9aa5 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -1040,6 +1040,7 @@ static noinline void check_xa_alloc_3(struct xarray *xa, unsigned int base) unsigned int i, id; unsigned long index; void *entry; + int ret; XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, xa_mk_index(1), limit, &next, GFP_KERNEL) != 0); @@ -1059,7 +1060,7 @@ static noinline void check_xa_alloc_3(struct xarray *xa, unsigned int base) else entry = xa_mk_index(i - 0x3fff); XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, entry, limit, - &next, GFP_KERNEL) != (id == 1)); + &next, GFP_KERNEL) != 0); XA_BUG_ON(xa, xa_mk_index(id) != entry); } @@ -1072,7 +1073,7 @@ static noinline void check_xa_alloc_3(struct xarray *xa, unsigned int base) xa_limit_32b, &next, GFP_KERNEL) != 0); XA_BUG_ON(xa, id != UINT_MAX); XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, xa_mk_index(base), - xa_limit_32b, &next, GFP_KERNEL) != 1); + xa_limit_32b, &next, GFP_KERNEL) != 0); XA_BUG_ON(xa, id != base); XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, xa_mk_index(base + 1), xa_limit_32b, &next, GFP_KERNEL) != 0); @@ -1080,7 +1081,19 @@ static noinline void check_xa_alloc_3(struct xarray *xa, unsigned int base) xa_for_each(xa, index, entry) xa_erase_index(xa, index); + XA_BUG_ON(xa, !xa_empty(xa)); + /* check wrap-around return of __xa_alloc_cyclic() */ + next = UINT_MAX; + XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, xa_mk_index(UINT_MAX), + xa_limit_32b, &next, GFP_KERNEL) != 0); + xa_lock(xa); + ret = __xa_alloc_cyclic(xa, &id, xa_mk_index(base), xa_limit_32b, + &next, GFP_KERNEL); + xa_unlock(xa); + XA_BUG_ON(xa, ret != 1); + xa_for_each(xa, index, entry) + xa_erase_index(xa, index); XA_BUG_ON(xa, !xa_empty(xa)); } diff --git a/lib/tests/slub_kunit.c b/lib/tests/slub_kunit.c index d47c472b05201..848b682a2d70e 100644 --- a/lib/tests/slub_kunit.c +++ b/lib/tests/slub_kunit.c @@ -325,4 +325,5 @@ static struct kunit_suite test_suite = { }; kunit_test_suite(test_suite); +MODULE_DESCRIPTION("Kunit tests for slub allocator"); MODULE_LICENSE("GPL"); diff --git a/lib/ucs2_string.c b/lib/ucs2_string.c index 9308bcfb2ad50..dfb4f2358cabf 100644 --- a/lib/ucs2_string.c +++ b/lib/ucs2_string.c @@ -165,4 +165,5 @@ ucs2_as_utf8(u8 *dest, const ucs2_char_t *src, unsigned long maxlength) } EXPORT_SYMBOL(ucs2_as_utf8); +MODULE_DESCRIPTION("UCS2 string handling"); MODULE_LICENSE("GPL v2"); diff --git a/lib/zlib_inflate/inflate_syms.c b/lib/zlib_inflate/inflate_syms.c index 9720114c06721..b8996d90e8bcd 100644 --- a/lib/zlib_inflate/inflate_syms.c +++ b/lib/zlib_inflate/inflate_syms.c @@ -18,4 +18,5 @@ EXPORT_SYMBOL(zlib_inflateEnd); EXPORT_SYMBOL(zlib_inflateReset); EXPORT_SYMBOL(zlib_inflateIncomp); EXPORT_SYMBOL(zlib_inflate_blob); +MODULE_DESCRIPTION("Data decompression using the deflation algorithm"); MODULE_LICENSE("GPL"); diff --git a/mm/Kconfig b/mm/Kconfig index e113f713b4938..60ea9eba48140 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -469,6 +469,10 @@ config HAVE_GUP_FAST depends on MMU bool +# Enable memblock support for scratch memory which is needed for kexec handover +config MEMBLOCK_KHO_SCRATCH + bool + # Don't discard allocated memory used to track "memory" and "reserved" memblocks # after early boot, so it can still be used to test for validity of memory. # Also, memblocks are updated with memory hot(un)plug. diff --git a/mm/cma.c b/mm/cma.c index b06d5fe73399f..15632939f20a1 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -35,7 +35,7 @@ struct cma cma_areas[MAX_CMA_AREAS]; unsigned int cma_area_count; -static int __init __cma_declare_contiguous_nid(phys_addr_t base, +static int __init __cma_declare_contiguous_nid(phys_addr_t *basep, phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, bool fixed, const char *name, struct cma **res_cma, @@ -370,7 +370,7 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size, phys_addr_t align, unsigned int order_per_bit, const char *name, struct cma **res_cma, int nid) { - phys_addr_t start, end; + phys_addr_t start = 0, end; phys_addr_t size, sizesum, sizeleft; struct cma_init_memrange *mrp, *mlp, *failed; struct cma_memrange *cmrp; @@ -384,7 +384,7 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size, /* * First, try it the normal way, producing just one range. */ - ret = __cma_declare_contiguous_nid(0, total_size, 0, align, + ret = __cma_declare_contiguous_nid(&start, total_size, 0, align, order_per_bit, false, name, res_cma, nid); if (ret != -ENOMEM) goto out; @@ -580,7 +580,7 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, { int ret; - ret = __cma_declare_contiguous_nid(base, size, limit, alignment, + ret = __cma_declare_contiguous_nid(&base, size, limit, alignment, order_per_bit, fixed, name, res_cma, nid); if (ret != 0) pr_err("Failed to reserve %ld MiB\n", @@ -592,14 +592,14 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, return ret; } -static int __init __cma_declare_contiguous_nid(phys_addr_t base, +static int __init __cma_declare_contiguous_nid(phys_addr_t *basep, phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, bool fixed, const char *name, struct cma **res_cma, int nid) { phys_addr_t memblock_end = memblock_end_of_DRAM(); - phys_addr_t highmem_start; + phys_addr_t highmem_start, base = *basep; int ret; /* @@ -722,12 +722,15 @@ static int __init __cma_declare_contiguous_nid(phys_addr_t base, } ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); - if (ret) + if (ret) { memblock_phys_free(base, size); + return ret; + } (*res_cma)->nid = nid; + *basep = base; - return ret; + return 0; } static void cma_debug_show_areas(struct cma *cma) diff --git a/mm/compaction.c b/mm/compaction.c index 139f00c0308a3..3925cb61dbb8f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -981,13 +981,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } if (PageHuge(page)) { + const unsigned int order = compound_order(page); /* * skip hugetlbfs if we are not compacting for pages * bigger than its order. THPs and other compound pages * are handled below. */ if (!cc->alloc_contig) { - const unsigned int order = compound_order(page); if (order <= MAX_PAGE_ORDER) { low_pfn += (1UL << order) - 1; @@ -1001,27 +1001,27 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, locked = NULL; } - ret = isolate_or_dissolve_huge_page(page, &cc->migratepages); + folio = page_folio(page); + ret = isolate_or_dissolve_huge_folio(folio, &cc->migratepages); /* - * Fail isolation in case isolate_or_dissolve_huge_page() + * Fail isolation in case isolate_or_dissolve_huge_folio() * reports an error. In case of -ENOMEM, abort right away. */ if (ret < 0) { /* Do not report -EBUSY down the chain */ if (ret == -EBUSY) ret = 0; - low_pfn += compound_nr(page) - 1; - nr_scanned += compound_nr(page) - 1; + low_pfn += (1UL << order) - 1; + nr_scanned += (1UL << order) - 1; goto isolate_fail; } - if (PageHuge(page)) { + if (folio_test_hugetlb(folio)) { /* * Hugepage was successfully isolated and placed * on the cc->migratepages list. */ - folio = page_folio(page); low_pfn += folio_nr_pages(folio) - 1; goto isolate_success_no_list; } @@ -2249,15 +2249,11 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat) static unsigned int fragmentation_score_wmark(bool low) { - unsigned int wmark_low; + unsigned int wmark_low, leeway; - /* - * Cap the low watermark to avoid excessive compaction - * activity in case a user sets the proactiveness tunable - * close to 100 (maximum). - */ - wmark_low = max(100U - sysctl_compaction_proactiveness, 5U); - return low ? wmark_low : min(wmark_low + 10, 100U); + wmark_low = 100U - sysctl_compaction_proactiveness; + leeway = min(10U, wmark_low / 2); + return low ? wmark_low : min(wmark_low + leeway, 100U); } static bool should_proactive_compact_node(pg_data_t *pgdat) @@ -2348,7 +2344,6 @@ static enum compact_result __compact_finished(struct compact_control *cc) ret = COMPACT_NO_SUITABLE_PAGE; for (order = cc->order; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &cc->zone->free_area[order]; - bool claim_block; /* Job done if page is free of the right migratetype */ if (!free_area_empty(area, migratetype)) @@ -2364,8 +2359,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) * Job done if allocation would steal freepages from * other migratetype buddy lists. */ - if (find_suitable_fallback(area, order, migratetype, - true, &claim_block) != -1) + if (find_suitable_fallback(area, order, migratetype, true) >= 0) /* * Movable pages are OK in any pageblock. If we are * stealing for a non-movable allocation, make sure diff --git a/mm/debug.c b/mm/debug.c index db83e381a8ae3..907382257062c 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -71,10 +71,12 @@ static void __dump_folio(struct folio *folio, struct page *page, unsigned long pfn, unsigned long idx) { struct address_space *mapping = folio_mapping(folio); - int mapcount = atomic_read(&page->_mapcount); + int mapcount = atomic_read(&page->_mapcount) + 1; char *type = ""; - mapcount = page_mapcount_is_type(mapcount) ? 0 : mapcount + 1; + if (page_mapcount_is_type(mapcount)) + mapcount = 0; + pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n", folio_ref_count(folio), mapcount, mapping, folio->index + idx, pfn); diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index bc748f700a9e1..7731b238b5340 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -910,26 +910,18 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args) #ifdef CONFIG_HUGETLB_PAGE static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { - struct page *page; pte_t pte; pr_debug("Validating HugeTLB basic\n"); - /* - * Accessing the page associated with the pfn is safe here, - * as it was previously derived from a real kernel symbol. - */ - page = pfn_to_page(args->fixed_pmd_pfn); - pte = mk_huge_pte(page, args->page_prot); + pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot); + pte = arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS); +#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB + WARN_ON(!pte_huge(pte)); +#endif WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte))); WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte)))); WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte)))); - -#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB - pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot); - - WARN_ON(!pte_huge(arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS))); -#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ } #else /* !CONFIG_HUGETLB_PAGE */ static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { } diff --git a/mm/filemap.c b/mm/filemap.c index b5e784f34d980..7b90cbeb4a1ad 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2244,6 +2244,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, *start = folio->index + nr; goto out; } + xas_advance(&xas, folio_next_index(folio) - 1); continue; put_folio: folio_put(folio); diff --git a/mm/gup.c b/mm/gup.c index 92351e2fa876b..f321683393901 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -844,11 +844,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, pte_t *ptep, pte; int ret; - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == - (FOLL_PIN | FOLL_GET))) - return ERR_PTR(-EINVAL); - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return no_page_table(vma, flags, address); @@ -1432,7 +1427,11 @@ static long __get_user_pages(struct mm_struct *mm, start = untagged_addr_remote(mm, start); - VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); + VM_WARN_ON_ONCE(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); + + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + VM_WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == + (FOLL_PIN | FOLL_GET)); do { struct page *page; @@ -2114,28 +2113,22 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, */ size_t fault_in_writeable(char __user *uaddr, size_t size) { - char __user *start = uaddr, *end; + const unsigned long start = (unsigned long)uaddr; + const unsigned long end = start + size; + unsigned long cur; if (unlikely(size == 0)) return 0; if (!user_write_access_begin(uaddr, size)) return size; - if (!PAGE_ALIGNED(uaddr)) { - unsafe_put_user(0, uaddr, out); - uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr); - } - end = (char __user *)PAGE_ALIGN((unsigned long)start + size); - if (unlikely(end < start)) - end = NULL; - while (uaddr != end) { - unsafe_put_user(0, uaddr, out); - uaddr += PAGE_SIZE; - } + /* Stop once we overflow to 0. */ + for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) + unsafe_put_user(0, (char __user *)cur, out); out: user_write_access_end(); - if (size > uaddr - start) - return size - (uaddr - start); + if (size > cur - start) + return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_writeable); @@ -2189,26 +2182,24 @@ EXPORT_SYMBOL(fault_in_subpage_writeable); */ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) { - unsigned long start = (unsigned long)uaddr, end; + const unsigned long start = (unsigned long)uaddr; + const unsigned long end = start + size; + unsigned long cur; struct mm_struct *mm = current->mm; bool unlocked = false; if (unlikely(size == 0)) return 0; - end = PAGE_ALIGN(start + size); - if (end < start) - end = 0; mmap_read_lock(mm); - do { - if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked)) + /* Stop once we overflow to 0. */ + for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) + if (fixup_user_fault(mm, cur, FAULT_FLAG_WRITE, &unlocked)) break; - start = (start + PAGE_SIZE) & PAGE_MASK; - } while (start != end); mmap_read_unlock(mm); - if (size > (unsigned long)uaddr - start) - return size - ((unsigned long)uaddr - start); + if (size > cur - start) + return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_safe_writeable); @@ -2223,30 +2214,24 @@ EXPORT_SYMBOL(fault_in_safe_writeable); */ size_t fault_in_readable(const char __user *uaddr, size_t size) { - const char __user *start = uaddr, *end; + const unsigned long start = (unsigned long)uaddr; + const unsigned long end = start + size; + unsigned long cur; volatile char c; if (unlikely(size == 0)) return 0; if (!user_read_access_begin(uaddr, size)) return size; - if (!PAGE_ALIGNED(uaddr)) { - unsafe_get_user(c, uaddr, out); - uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr); - } - end = (const char __user *)PAGE_ALIGN((unsigned long)start + size); - if (unlikely(end < start)) - end = NULL; - while (uaddr != end) { - unsafe_get_user(c, uaddr, out); - uaddr += PAGE_SIZE; - } + /* Stop once we overflow to 0. */ + for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) + unsafe_get_user(c, (const char __user *)cur, out); out: user_read_access_end(); (void)c; - if (size > uaddr - start) - return size - (uaddr - start); + if (size > cur - start) + return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_readable); @@ -3173,46 +3158,6 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, return 1; } -static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, - int *nr) -{ - int refs; - struct page *page; - struct folio *folio; - - if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) - return 0; - - BUILD_BUG_ON(pgd_devmap(orig)); - - page = pgd_page(orig); - refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr); - - folio = try_grab_folio_fast(page, refs, flags); - if (!folio) - return 0; - - if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { - gup_put_folio(folio, refs, flags); - return 0; - } - - if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { - gup_put_folio(folio, refs, flags); - return 0; - } - - if (!gup_fast_folio_allowed(folio, flags)) { - gup_put_folio(folio, refs, flags); - return 0; - } - - *nr += refs; - folio_set_referenced(folio); - return 1; -} - static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) @@ -3307,12 +3252,9 @@ static void gup_fast_pgd_range(unsigned long addr, unsigned long end, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) return; - if (unlikely(pgd_leaf(pgd))) { - if (!gup_fast_pgd_leaf(pgd, pgdp, addr, next, flags, - pages, nr)) - return; - } else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags, - pages, nr)) + BUILD_BUG_ON(pgd_leaf(pgd)); + if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags, + pages, nr)) return; } while (pgdp++, addr = next, addr != end); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2a47682d1ab77..e97a975864783 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1203,7 +1203,7 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, { pmd_t entry; - entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); + entry = folio_mk_pmd(folio, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); @@ -1309,8 +1309,7 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, struct folio *zero_folio) { pmd_t entry; - entry = mk_pmd(&zero_folio->page, vma->vm_page_prot); - entry = pmd_mkhuge(entry); + entry = folio_mk_pmd(zero_folio, vma->vm_page_prot); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); mm_inc_nr_ptes(mm); @@ -2260,6 +2259,14 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PMD_NR); + + /* + * Use flush_needed to indicate whether the PMD entry + * is present, instead of checking pmd_present() again. + */ + if (flush_needed && pmd_young(orig_pmd) && + likely(vma_has_recency(vma))) + folio_mark_accessed(folio); } spin_unlock(ptl); @@ -2653,12 +2660,12 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm folio_move_anon_rmap(src_folio, dst_vma); src_folio->index = linear_page_index(dst_vma, dst_addr); - _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); + _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot); /* Follow mremap() behavior and treat the entry dirty after the move */ _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); } else { src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); - _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot); + _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot); } set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd); @@ -4675,7 +4682,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) entry = pmd_to_swp_entry(*pvmw->pmd); folio_get(folio); - pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)); + pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot)); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); if (is_writable_migration_entry(entry)) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 39f92aad7bd1e..e287d8050b407 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1950,7 +1950,6 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, int order = huge_page_order(h); struct folio *folio; bool alloc_try_hard = true; - bool retry = true; /* * By default we always try hard to allocate the folio with @@ -1965,22 +1964,8 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); -retry: - folio = __folio_alloc(gfp_mask, order, nid, nmask); - /* Ensure hugetlb folio won't have large_rmappable flag set. */ - if (folio) - folio_clear_large_rmappable(folio); - if (folio && !folio_ref_freeze(folio, 1)) { - folio_put(folio); - if (retry) { /* retry once */ - retry = false; - goto retry; - } - /* WOW! twice in a row. */ - pr_warn("HugeTLB unexpected inflated folio ref count\n"); - folio = NULL; - } + folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask); /* * If we did not specify __GFP_RETRY_MAYFAIL, but still got a @@ -2271,7 +2256,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, * as surplus_pages, otherwise it might confuse * persistent_huge_pages() momentarily. */ - __prep_account_new_huge_page(h, nid); + __prep_account_new_huge_page(h, folio_nid(folio)); /* * We could have raced with the pool size change. @@ -2419,7 +2404,6 @@ static int gather_surplus_pages(struct hstate *h, long delta) long i; long needed, allocated; bool alloc_ok = true; - int node; nodemask_t *mbind_nodemask, alloc_nodemask; mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); @@ -2443,21 +2427,12 @@ static int gather_surplus_pages(struct hstate *h, long delta) for (i = 0; i < needed; i++) { folio = NULL; - /* Prioritize current node */ - if (node_isset(numa_mem_id(), alloc_nodemask)) - folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), - numa_mem_id(), NULL); - - if (!folio) { - for_each_node_mask(node, alloc_nodemask) { - if (node == numa_mem_id()) - continue; - folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), - node, NULL); - if (folio) - break; - } - } + /* + * It is okay to use NUMA_NO_NODE because we use numa_mem_id() + * down the road to pick the current node if that is the case. + */ + folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), + NUMA_NO_NODE, &alloc_nodemask); if (!folio) { alloc_ok = false; break; @@ -2896,10 +2871,9 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, return ret; } -int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) +int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) { struct hstate *h; - struct folio *folio = page_folio(page); int ret = -EBUSY; /* @@ -3010,7 +2984,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; - long retval, gbl_chg; + long retval, gbl_chg, gbl_reserve; map_chg_state map_chg; int ret, idx; struct hugetlb_cgroup *h_cg = NULL; @@ -3163,8 +3137,16 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: - if (map_chg) - hugepage_subpool_put_pages(spool, 1); + /* + * put page to subpool iff the quota of subpool's rsv_hpages is used + * during hugepage_subpool_get_pages. + */ + if (map_chg && !gbl_chg) { + gbl_reserve = hugepage_subpool_put_pages(spool, 1); + hugetlb_acct_memory(h, -gbl_reserve); + } + + out_end_reservation: if (map_chg != MAP_CHG_ENFORCED) vma_end_reservation(h, vma, addr); @@ -3825,6 +3807,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { + unsigned long persistent_free_count; unsigned long min_count; unsigned long allocated; struct folio *folio; @@ -3959,8 +3942,24 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. + * + * min_count is the expected number of persistent pages, we + * shouldn't calculate min_count by using + * resv_huge_pages + persistent_huge_pages() - free_huge_pages, + * because there may exist free surplus huge pages, and this will + * lead to subtracting twice. Free surplus huge pages come from HVO + * failing to restore vmemmap, see comments in the callers of + * hugetlb_vmemmap_restore_folio(). Thus, we should calculate + * persistent free count first. */ - min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; + persistent_free_count = h->free_huge_pages; + if (h->free_huge_pages > persistent_huge_pages(h)) { + if (h->free_huge_pages > h->surplus_huge_pages) + persistent_free_count -= h->surplus_huge_pages; + else + persistent_free_count = 0; + } + min_count = h->resv_huge_pages + persistent_huge_pages(h) - persistent_free_count; min_count = max(count, min_count); try_to_free_low(h, min_count, nodes_allowed); @@ -4630,7 +4629,7 @@ static void __init hugetlb_sysfs_init(void) err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, hstate_kobjs, &hstate_attr_group); if (err) - pr_err("HugeTLB: Unable to add hstate %s", h->name); + pr_err("HugeTLB: Unable to add hstate %s\n", h->name); } #ifdef CONFIG_NUMA @@ -5441,18 +5440,16 @@ const struct vm_operations_struct hugetlb_vm_ops = { .pagesize = hugetlb_vm_op_pagesize, }; -static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, +static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio, bool try_mkwrite) { - pte_t entry; + pte_t entry = folio_mk_pte(folio, vma->vm_page_prot); unsigned int shift = huge_page_shift(hstate_vma(vma)); if (try_mkwrite && (vma->vm_flags & VM_WRITE)) { - entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, - vma->vm_page_prot))); + entry = pte_mkwrite_novma(pte_mkdirty(entry)); } else { - entry = huge_pte_wrprotect(mk_huge_pte(page, - vma->vm_page_prot)); + entry = pte_wrprotect(entry); } entry = pte_mkyoung(entry); entry = arch_make_huge_pte(entry, shift, vma->vm_flags); @@ -5507,7 +5504,7 @@ static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, struct folio *new_folio, pte_t old, unsigned long sz) { - pte_t newpte = make_huge_pte(vma, &new_folio->page, true); + pte_t newpte = make_huge_pte(vma, new_folio, true); __folio_mark_uptodate(new_folio); hugetlb_add_new_anon_rmap(new_folio, vma, addr); @@ -6257,7 +6254,7 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, spin_lock(vmf->ptl); vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) { - pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare); + pte_t newpte = make_huge_pte(vma, new_folio, !unshare); /* Break COW or unshare */ huge_ptep_clear_flush(vma, vmf->address, vmf->pte); @@ -6537,7 +6534,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); - new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED); + new_pte = make_huge_pte(vma, folio, vma->vm_flags & VM_SHARED); /* * If this pte was previously wr-protected, keep it wr-protected even * if populated. @@ -7022,7 +7019,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY * with wp flag set, don't set pte write bit. */ - _dst_pte = make_huge_pte(dst_vma, &folio->page, + _dst_pte = make_huge_pte(dst_vma, folio, !wp_enabled && !(is_continue && !vm_shared)); /* * Always mark UFFDIO_COPY page dirty; note that this may not be @@ -7216,7 +7213,7 @@ bool hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma, vm_flags_t vm_flags) { - long chg = -1, add = -1; + long chg = -1, add = -1, spool_resv, gbl_resv; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; @@ -7351,8 +7348,16 @@ bool hugetlb_reserve_pages(struct inode *inode, return true; out_put_pages: - /* put back original number of pages, chg */ - (void)hugepage_subpool_put_pages(spool, chg); + spool_resv = chg - gbl_reserve; + if (spool_resv) { + /* put sub pool's reservation back, chg - gbl_reserve */ + gbl_resv = hugepage_subpool_put_pages(spool, spool_resv); + /* + * subpool's reserved pages can not be put back due to race, + * return to hstate. + */ + hugetlb_acct_memory(h, -gbl_resv); + } out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 9a99dfa3c4958..27245e86df250 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -238,11 +238,11 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, * struct page, the special metadata (e.g. page->flags or page->mapping) * cannot copy to the tail struct page structs. The invalid value will be * checked in the free_tail_page_prepare(). In order to avoid the message - * of "corrupted mapping in tail page". We need to reset at least 3 (one - * head struct page struct and two tail struct page structs) struct page + * of "corrupted mapping in tail page". We need to reset at least 4 (one + * head struct page struct and three tail struct page structs) struct page * structs. */ -#define NR_RESET_STRUCT_PAGE 3 +#define NR_RESET_STRUCT_PAGE 4 static inline void reset_struct_pages(struct page *start) { diff --git a/mm/internal.h b/mm/internal.h index 50c2f590b2d04..838f840ded83d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -435,6 +435,9 @@ void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details); +void zap_page_range_single_batched(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, + unsigned long size, struct zap_details *details); int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp); @@ -915,7 +918,7 @@ static inline void init_cma_pageblock(struct page *page) int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool claim_only, bool *claim_block); + int migratetype, bool claimable); static inline bool free_area_empty(struct free_area *area, int migratetype) { @@ -1121,6 +1124,8 @@ DECLARE_STATIC_KEY_TRUE(deferred_pages); bool __init deferred_grow_zone(struct zone *zone, unsigned int order); #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ +void init_deferred_page(unsigned long pfn, int nid); + enum mminit_level { MMINIT_WARNING, MMINIT_VERIFY, @@ -1595,6 +1600,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc); #ifdef CONFIG_UNACCEPTED_MEMORY void accept_page(struct page *page); +void unaccepted_cleanup_work(struct work_struct *work); #else /* CONFIG_UNACCEPTED_MEMORY */ static inline void accept_page(struct page *page) { diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index 3ea317837c2d3..f24e3bef72a44 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -2127,4 +2127,5 @@ static struct kunit_suite kasan_kunit_test_suite = { kunit_test_suite(kasan_kunit_test_suite); +MODULE_DESCRIPTION("KUnit tests for checking KASAN bug-detection capabilities"); MODULE_LICENSE("GPL"); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index cc945c6ab3bdb..b8838ba8207ae 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1239,7 +1239,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, __folio_mark_uptodate(folio); pgtable = pmd_pgtable(_pmd); - _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot); + _pmd = folio_mk_pmd(folio, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); spin_lock(pmd_ptl); diff --git a/mm/madvise.c b/mm/madvise.c index b17f684322ad7..8433ac9b27e09 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -48,6 +48,11 @@ struct madvise_walk_private { bool pageout; }; +struct madvise_behavior { + int behavior; + struct mmu_gather *tlb; +}; + /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_lock for writing. Others, which simply traverse vmas, need @@ -794,12 +799,13 @@ static const struct mm_walk_ops madvise_free_walk_ops = { .walk_lock = PGWALK_RDLOCK, }; -static int madvise_free_single_vma(struct vm_area_struct *vma, +static int madvise_free_single_vma(struct madvise_behavior *madv_behavior, + struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; - struct mmu_gather tlb; + struct mmu_gather *tlb = madv_behavior->tlb; /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) @@ -815,17 +821,14 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.start, range.end); lru_add_drain(); - tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); - tlb_start_vma(&tlb, vma); + tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, range.start, range.end, - &madvise_free_walk_ops, &tlb); - tlb_end_vma(&tlb, vma); + &madvise_free_walk_ops, tlb); + tlb_end_vma(tlb, vma); mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb); - return 0; } @@ -848,7 +851,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). */ -static long madvise_dontneed_single_vma(struct vm_area_struct *vma, +static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior, + struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct zap_details details = { @@ -856,7 +860,8 @@ static long madvise_dontneed_single_vma(struct vm_area_struct *vma, .even_cows = true, }; - zap_page_range_single(vma, start, end - start, &details); + zap_page_range_single_batched( + madv_behavior->tlb, vma, start, end - start, &details); return 0; } @@ -893,8 +898,9 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, static long madvise_dontneed_free(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - int behavior) + struct madvise_behavior *madv_behavior) { + int behavior = madv_behavior->behavior; struct mm_struct *mm = vma->vm_mm; *prev = vma; @@ -946,9 +952,10 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, } if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) - return madvise_dontneed_single_vma(vma, start, end); + return madvise_dontneed_single_vma( + madv_behavior, vma, start, end); else if (behavior == MADV_FREE) - return madvise_free_single_vma(vma, start, end); + return madvise_free_single_vma(madv_behavior, vma, start, end); else return -EINVAL; } @@ -1249,8 +1256,10 @@ static long madvise_guard_remove(struct vm_area_struct *vma, static int madvise_vma_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long behavior) + void *behavior_arg) { + struct madvise_behavior *arg = behavior_arg; + int behavior = arg->behavior; int error; struct anon_vma_name *anon_name; unsigned long new_flags = vma->vm_flags; @@ -1270,7 +1279,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: - return madvise_dontneed_free(vma, prev, start, end, behavior); + return madvise_dontneed_free(vma, prev, start, end, arg); case MADV_NORMAL: new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; @@ -1487,10 +1496,10 @@ static bool process_madvise_remote_valid(int behavior) */ static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned long arg, + unsigned long end, void *arg, int (*visit)(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, - unsigned long end, unsigned long arg)) + unsigned long end, void *arg)) { struct vm_area_struct *vma; struct vm_area_struct *prev; @@ -1548,7 +1557,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, static int madvise_vma_anon_name(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long anon_name) + void *anon_name) { int error; @@ -1557,7 +1566,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, - (struct anon_vma_name *)anon_name); + anon_name); /* * madvise() returns EAGAIN if kernel resources, such as @@ -1589,7 +1598,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, if (end == start) return 0; - return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, + return madvise_walk_vmas(mm, start, end, anon_name, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ @@ -1619,6 +1628,31 @@ static void madvise_unlock(struct mm_struct *mm, int behavior) mmap_read_unlock(mm); } +static bool madvise_batch_tlb_flush(int behavior) +{ + switch (behavior) { + case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: + case MADV_FREE: + return true; + default: + return false; + } +} + +static void madvise_init_tlb(struct madvise_behavior *madv_behavior, + struct mm_struct *mm) +{ + if (madvise_batch_tlb_flush(madv_behavior->behavior)) + tlb_gather_mmu(madv_behavior->tlb, mm); +} + +static void madvise_finish_tlb(struct madvise_behavior *madv_behavior) +{ + if (madvise_batch_tlb_flush(madv_behavior->behavior)) + tlb_finish_mmu(madv_behavior->tlb); +} + static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) { size_t len; @@ -1677,8 +1711,10 @@ static bool is_madvise_populate(int behavior) } static int madvise_do_behavior(struct mm_struct *mm, - unsigned long start, size_t len_in, int behavior) + unsigned long start, size_t len_in, + struct madvise_behavior *madv_behavior) { + int behavior = madv_behavior->behavior; struct blk_plug plug; unsigned long end; int error; @@ -1692,7 +1728,7 @@ static int madvise_do_behavior(struct mm_struct *mm, if (is_madvise_populate(behavior)) error = madvise_populate(mm, start, end, behavior); else - error = madvise_walk_vmas(mm, start, end, behavior, + error = madvise_walk_vmas(mm, start, end, madv_behavior, madvise_vma_behavior); blk_finish_plug(&plug); return error; @@ -1773,13 +1809,20 @@ static int madvise_do_behavior(struct mm_struct *mm, int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { int error; + struct mmu_gather tlb; + struct madvise_behavior madv_behavior = { + .behavior = behavior, + .tlb = &tlb, + }; if (madvise_should_skip(start, len_in, behavior, &error)) return error; error = madvise_lock(mm, behavior); if (error) return error; - error = madvise_do_behavior(mm, start, len_in, behavior); + madvise_init_tlb(&madv_behavior, mm); + error = madvise_do_behavior(mm, start, len_in, &madv_behavior); + madvise_finish_tlb(&madv_behavior); madvise_unlock(mm, behavior); return error; @@ -1796,12 +1839,18 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, { ssize_t ret = 0; size_t total_len; + struct mmu_gather tlb; + struct madvise_behavior madv_behavior = { + .behavior = behavior, + .tlb = &tlb, + }; total_len = iov_iter_count(iter); ret = madvise_lock(mm, behavior); if (ret) return ret; + madvise_init_tlb(&madv_behavior, mm); while (iov_iter_count(iter)) { unsigned long start = (unsigned long)iter_iov_addr(iter); @@ -1811,7 +1860,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, if (madvise_should_skip(start, len_in, behavior, &error)) ret = error; else - ret = madvise_do_behavior(mm, start, len_in, behavior); + ret = madvise_do_behavior(mm, start, len_in, + &madv_behavior); /* * An madvise operation is attempting to restart the syscall, * but we cannot proceed as it would not be correct to repeat @@ -1829,14 +1879,17 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, } /* Drop and reacquire lock to unwind race. */ + madvise_finish_tlb(&madv_behavior); madvise_unlock(mm, behavior); madvise_lock(mm, behavior); + madvise_init_tlb(&madv_behavior, mm); continue; } if (ret < 0) break; iov_iter_advance(iter, iter_iov_len(iter)); } + madvise_finish_tlb(&madv_behavior); madvise_unlock(mm, behavior); ret = (total_len - iov_iter_count(iter)) ? : ret; diff --git a/mm/memblock.c b/mm/memblock.c index 0a53db4d9f7be..3571a859f2fe1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -18,6 +18,11 @@ #include #include +#ifdef CONFIG_KEXEC_HANDOVER +#include +#include +#endif /* CONFIG_KEXEC_HANDOVER */ + #include #include @@ -107,6 +112,13 @@ unsigned long min_low_pfn; unsigned long max_pfn; unsigned long long max_possible_pfn; +#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH +/* When set to true, only allocate from MEMBLOCK_KHO_SCRATCH ranges */ +static bool kho_scratch_only; +#else +#define kho_scratch_only false +#endif + static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -166,6 +178,10 @@ bool __init_memblock memblock_has_mirror(void) static enum memblock_flags __init_memblock choose_memblock_flags(void) { + /* skip non-scratch memory for kho early boot allocations */ + if (kho_scratch_only) + return MEMBLOCK_KHO_SCRATCH; + return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; } @@ -492,7 +508,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, * needn't do it */ if (!use_slab) - BUG_ON(memblock_reserve(addr, new_alloc_size)); + BUG_ON(memblock_reserve_kern(addr, new_alloc_size)); /* Update slab flag */ *in_slab = use_slab; @@ -642,7 +658,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type, #ifdef CONFIG_NUMA WARN_ON(nid != memblock_get_region_node(rgn)); #endif - WARN_ON(flags != rgn->flags); + WARN_ON(flags != MEMBLOCK_NONE && flags != rgn->flags); nr_new++; if (insert) { if (start_rgn == -1) @@ -902,14 +918,15 @@ int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size) return memblock_remove_range(&memblock.reserved, base, size); } -int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +int __init_memblock __memblock_reserve(phys_addr_t base, phys_addr_t size, + int nid, enum memblock_flags flags) { phys_addr_t end = base + size - 1; - memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, - &base, &end, (void *)_RET_IP_); + memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__, + &base, &end, nid, flags, (void *)_RET_IP_); - return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0); + return memblock_add_range(&memblock.reserved, base, size, nid, flags); } #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -924,6 +941,40 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size) } #endif +#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH +__init_memblock void memblock_set_kho_scratch_only(void) +{ + kho_scratch_only = true; +} + +__init_memblock void memblock_clear_kho_scratch_only(void) +{ + kho_scratch_only = false; +} + +void __init_memblock memmap_init_kho_scratch_pages(void) +{ + phys_addr_t start, end; + unsigned long pfn; + int nid; + u64 i; + + if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) + return; + + /* + * Initialize struct pages for free scratch memory. + * The struct pages for reserved scratch memory will be set up in + * reserve_bootmem_region() + */ + __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, + MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) { + for (pfn = PFN_UP(start); pfn < PFN_DOWN(end); pfn++) + init_deferred_page(pfn, nid); + } +} +#endif + /** * memblock_setclr_flag - set or clear flag for a memory region * @type: memblock type to set/clear flag for @@ -1049,6 +1100,36 @@ int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t MEMBLOCK_RSRV_NOINIT); } +/** + * memblock_mark_kho_scratch - Mark a memory region as MEMBLOCK_KHO_SCRATCH. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Only memory regions marked with %MEMBLOCK_KHO_SCRATCH will be considered + * for allocations during early boot with kexec handover. + * + * Return: 0 on success, -errno on failure. + */ +int __init_memblock memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(&memblock.memory, base, size, 1, + MEMBLOCK_KHO_SCRATCH); +} + +/** + * memblock_clear_kho_scratch - Clear MEMBLOCK_KHO_SCRATCH flag for a + * specified region. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return: 0 on success, -errno on failure. + */ +int __init_memblock memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(&memblock.memory, base, size, 0, + MEMBLOCK_KHO_SCRATCH); +} + static bool should_skip_region(struct memblock_type *type, struct memblock_region *m, int nid, int flags) @@ -1080,6 +1161,13 @@ static bool should_skip_region(struct memblock_type *type, if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m)) return true; + /* + * In early alloc during kexec handover, we can only consider + * MEMBLOCK_KHO_SCRATCH regions for the allocations + */ + if ((flags & MEMBLOCK_KHO_SCRATCH) && !memblock_is_kho_scratch(m)) + return true; + return false; } @@ -1460,14 +1548,14 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, again: found = memblock_find_in_range_node(size, align, start, end, nid, flags); - if (found && !memblock_reserve(found, size)) + if (found && !__memblock_reserve(found, size, nid, MEMBLOCK_RSRV_KERN)) goto done; if (numa_valid_node(nid) && !exact_nid) { found = memblock_find_in_range_node(size, align, start, end, NUMA_NO_NODE, flags); - if (found && !memblock_reserve(found, size)) + if (found && !memblock_reserve_kern(found, size)) goto done; } @@ -1752,6 +1840,28 @@ phys_addr_t __init_memblock memblock_reserved_size(void) return memblock.reserved.total_size; } +phys_addr_t __init_memblock memblock_reserved_kern_size(phys_addr_t limit, int nid) +{ + struct memblock_region *r; + phys_addr_t total = 0; + + for_each_reserved_mem_region(r) { + phys_addr_t size = r->size; + + if (r->base > limit) + break; + + if (r->base + r->size > limit) + size = limit - r->base; + + if (nid == memblock_get_region_node(r) || !numa_valid_node(nid)) + if (r->flags & MEMBLOCK_RSRV_KERN) + total += size; + } + + return total; +} + /** * memblock_estimated_nr_free_pages - return estimated number of free pages * from memblock point of view @@ -2272,6 +2382,7 @@ void __init memblock_free_all(void) free_unused_memmap(); reset_all_zones_managed_pages(); + memblock_clear_kho_scratch_only(); pages = free_low_memory_core_early(); totalram_pages_add(pages); } @@ -2369,6 +2480,201 @@ int reserve_mem_release_by_name(const char *name) return 1; } +#ifdef CONFIG_KEXEC_HANDOVER +#define MEMBLOCK_KHO_FDT "memblock" +#define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1" +#define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1" +static struct page *kho_fdt; + +static int reserve_mem_kho_finalize(struct kho_serialization *ser) +{ + int err = 0, i; + + if (!reserved_mem_count) + return NOTIFY_DONE; + + if (IS_ERR(kho_fdt)) { + err = PTR_ERR(kho_fdt); + pr_err("memblock FDT was not prepared successfully: %d\n", err); + return notifier_from_errno(err); + } + + for (i = 0; i < reserved_mem_count; i++) { + struct reserve_mem_table *map = &reserved_mem_table[i]; + + err |= kho_preserve_phys(ser, map->start, map->size); + } + + err |= kho_preserve_folio(ser, page_folio(kho_fdt)); + err |= kho_add_subtree(ser, MEMBLOCK_KHO_FDT, page_to_virt(kho_fdt)); + + return notifier_from_errno(err); +} + +static int reserve_mem_kho_notifier(struct notifier_block *self, + unsigned long cmd, void *v) +{ + switch (cmd) { + case KEXEC_KHO_FINALIZE: + return reserve_mem_kho_finalize((struct kho_serialization *)v); + case KEXEC_KHO_ABORT: + return NOTIFY_DONE; + default: + return NOTIFY_BAD; + } +} + +static struct notifier_block reserve_mem_kho_nb = { + .notifier_call = reserve_mem_kho_notifier, +}; + +static void __init prepare_kho_fdt(void) +{ + int err = 0, i; + void *fdt; + + if (!reserved_mem_count) + return; + + kho_fdt = alloc_page(GFP_KERNEL); + if (!kho_fdt) { + kho_fdt = ERR_PTR(-ENOMEM); + return; + } + + fdt = page_to_virt(kho_fdt); + + err |= fdt_create(fdt, PAGE_SIZE); + err |= fdt_finish_reservemap(fdt); + + err |= fdt_begin_node(fdt, ""); + err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE); + for (i = 0; i < reserved_mem_count; i++) { + struct reserve_mem_table *map = &reserved_mem_table[i]; + + err |= fdt_begin_node(fdt, map->name); + err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE); + err |= fdt_property(fdt, "start", &map->start, sizeof(map->start)); + err |= fdt_property(fdt, "size", &map->size, sizeof(map->size)); + err |= fdt_end_node(fdt); + } + err |= fdt_end_node(fdt); + + err |= fdt_finish(fdt); + + if (err) { + pr_err("failed to prepare memblock FDT for KHO: %d\n", err); + put_page(kho_fdt); + kho_fdt = ERR_PTR(-EINVAL); + } +} + +static int __init reserve_mem_init(void) +{ + if (!kho_is_enabled()) + return 0; + + prepare_kho_fdt(); + + return register_kho_notifier(&reserve_mem_kho_nb); +} +late_initcall(reserve_mem_init); + +static void *kho_fdt_in __initdata; + +static void *__init reserve_mem_kho_retrieve_fdt(void) +{ + phys_addr_t fdt_phys; + struct folio *fdt_folio; + void *fdt; + int err; + + err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys); + if (err) { + if (err != -ENOENT) + pr_warn("failed to retrieve FDT '%s' from KHO: %d\n", + MEMBLOCK_KHO_FDT, err); + return ERR_PTR(err); + } + + fdt_folio = kho_restore_folio(fdt_phys); + if (!fdt_folio) { + pr_warn("failed to restore memblock KHO FDT (0x%llx)\n", fdt_phys); + return ERR_PTR(-EFAULT); + } + + fdt = page_to_virt(folio_page(fdt_folio, 0)); + + err = fdt_node_check_compatible(fdt, 0, MEMBLOCK_KHO_NODE_COMPATIBLE); + if (err) { + pr_warn("FDT '%s' is incompatible with '%s': %d\n", + MEMBLOCK_KHO_FDT, MEMBLOCK_KHO_NODE_COMPATIBLE, err); + return ERR_PTR(-EINVAL); + } + + return fdt; +} + +static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size, + phys_addr_t align) +{ + int err, len_start, len_size, offset; + const phys_addr_t *p_start, *p_size; + const void *fdt; + + if (!kho_fdt_in) + kho_fdt_in = reserve_mem_kho_retrieve_fdt(); + + if (IS_ERR(kho_fdt_in)) + return false; + + fdt = kho_fdt_in; + + offset = fdt_subnode_offset(fdt, 0, name); + if (offset < 0) { + pr_warn("FDT '%s' has no child '%s': %d\n", + MEMBLOCK_KHO_FDT, name, offset); + return false; + } + err = fdt_node_check_compatible(fdt, offset, RESERVE_MEM_KHO_NODE_COMPATIBLE); + if (err) { + pr_warn("Node '%s' is incompatible with '%s': %d\n", + name, RESERVE_MEM_KHO_NODE_COMPATIBLE, err); + return false; + } + + p_start = fdt_getprop(fdt, offset, "start", &len_start); + p_size = fdt_getprop(fdt, offset, "size", &len_size); + if (!p_start || len_start != sizeof(*p_start) || !p_size || + len_size != sizeof(*p_size)) { + return false; + } + + if (*p_start & (align - 1)) { + pr_warn("KHO reserve-mem '%s' has wrong alignment (0x%lx, 0x%lx)\n", + name, (long)align, (long)*p_start); + return false; + } + + if (*p_size != size) { + pr_warn("KHO reserve-mem '%s' has wrong size (0x%lx != 0x%lx)\n", + name, (long)*p_size, (long)size); + return false; + } + + reserved_mem_add(*p_start, size, name); + pr_info("Revived memory reservation '%s' from KHO\n", name); + + return true; +} +#else +static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size, + phys_addr_t align) +{ + return false; +} +#endif /* CONFIG_KEXEC_HANDOVER */ + /* * Parse reserve_mem=nn:align:name */ @@ -2424,6 +2730,11 @@ static int __init reserve_mem(char *p) if (reserve_mem_find_by_name(name, &start, &tmp)) return -EBUSY; + /* Pick previous allocations up from KHO if available */ + if (reserve_mem_kho_revive(name, size, align)) + return 1; + + /* TODO: Allocation must be outside of scratch region */ start = memblock_phys_alloc(size, align); if (!start) return -ENOMEM; @@ -2441,6 +2752,8 @@ static const char * const flagname[] = { [ilog2(MEMBLOCK_NOMAP)] = "NOMAP", [ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG", [ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT", + [ilog2(MEMBLOCK_RSRV_KERN)] = "RSV_KERN", + [ilog2(MEMBLOCK_KHO_SCRATCH)] = "KHO_SCRATCH", }; static int memblock_debug_show(struct seq_file *m, void *private) diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 8660908850dc8..4a9cf27a70af0 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -620,7 +620,7 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - swap_cgroup_record(folio, mem_cgroup_id(memcg), entry); + swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry); folio_unqueue_deferred_split(folio); folio->memcg_data = 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 421740f1bcdc6..dedb74d399a97 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -129,8 +129,7 @@ bool mem_cgroup_kmem_disabled(void) return cgroup_memory_nokmem; } -static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, - unsigned int nr_pages); +static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages); static void obj_cgroup_release(struct percpu_ref *ref) { @@ -163,8 +162,16 @@ static void obj_cgroup_release(struct percpu_ref *ref) WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)); nr_pages = nr_bytes >> PAGE_SHIFT; - if (nr_pages) - obj_cgroup_uncharge_pages(objcg, nr_pages); + if (nr_pages) { + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(objcg); + mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); + memcg1_account_kmem(memcg, -nr_pages); + if (!mem_cgroup_is_root(memcg)) + memcg_uncharge(memcg, nr_pages); + mem_cgroup_put(memcg); + } spin_lock_irqsave(&objcg_lock, flags); list_del(&objcg->list); @@ -463,6 +470,8 @@ static const unsigned int memcg_vm_event_stat[] = { NUMA_PAGE_MIGRATE, NUMA_PTE_UPDATES, NUMA_HINT_FAULTS, + NUMA_TASK_MIGRATE, + NUMA_TASK_SWAP, #endif }; @@ -585,18 +594,20 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) cgroup_rstat_updated(memcg->css.cgroup, cpu); statc = this_cpu_ptr(memcg->vmstats_percpu); for (; statc; statc = statc->parent) { + /* + * If @memcg is already flushable then all its ancestors are + * flushable as well and also there is no need to increase + * stats_updates. + */ + if (memcg_vmstats_needs_flush(statc->vmstats)) + break; + stats_updates = READ_ONCE(statc->stats_updates) + abs(val); WRITE_ONCE(statc->stats_updates, stats_updates); if (stats_updates < MEMCG_CHARGE_BATCH) continue; - /* - * If @memcg is already flush-able, increasing stats_updates is - * redundant. Avoid the overhead of the atomic update. - */ - if (!memcg_vmstats_needs_flush(statc->vmstats)) - atomic64_add(stats_updates, - &statc->vmstats->stats_updates); + atomic64_add(stats_updates, &statc->vmstats->stats_updates); WRITE_ONCE(statc->stats_updates, 0); } } @@ -703,10 +714,12 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; + memcg_stats_lock(); __this_cpu_add(memcg->vmstats_percpu->state[i], val); val = memcg_state_val_in_pages(idx, val); memcg_rstat_updated(memcg, val); trace_mod_memcg_state(memcg, idx, val); + memcg_stats_unlock(); } #ifdef CONFIG_MEMCG_V1 @@ -1664,7 +1677,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * A few threads which were not waiting at mutex_lock_killable() can * fail to bail out. Therefore, check again after holding oom_lock. */ - ret = task_is_dying() || out_of_memory(&oc); + ret = out_of_memory(&oc); unlock: mutex_unlock(&oom_lock); @@ -1759,7 +1772,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) } struct memcg_stock_pcp { - localtry_lock_t stock_lock; + local_trylock_t stock_lock; struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; @@ -1774,11 +1787,11 @@ struct memcg_stock_pcp { #define FLUSHING_CACHED_CHARGE 0 }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { - .stock_lock = INIT_LOCALTRY_LOCK(stock_lock), + .stock_lock = INIT_LOCAL_TRYLOCK(stock_lock), }; static DEFINE_MUTEX(percpu_charge_mutex); -static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); +static void drain_obj_stock(struct memcg_stock_pcp *stock); static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg); @@ -1805,11 +1818,10 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages, if (nr_pages > MEMCG_CHARGE_BATCH) return ret; - if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) { - if (!gfpflags_allow_spinning(gfp_mask)) - return ret; - localtry_lock_irqsave(&memcg_stock.stock_lock, flags); - } + if (gfpflags_allow_spinning(gfp_mask)) + local_lock_irqsave(&memcg_stock.stock_lock, flags); + else if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) + return ret; stock = this_cpu_ptr(&memcg_stock); stock_pages = READ_ONCE(stock->nr_pages); @@ -1818,11 +1830,18 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages, ret = true; } - localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); return ret; } +static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_uncharge(&memcg->memsw, nr_pages); +} + /* * Returns stocks cached in percpu and reset cached information. */ @@ -1835,10 +1854,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) return; if (stock_pages) { - page_counter_uncharge(&old->memory, stock_pages); - if (do_memsw_account()) - page_counter_uncharge(&old->memsw, stock_pages); - + memcg_uncharge(old, stock_pages); WRITE_ONCE(stock->nr_pages, 0); } @@ -1849,7 +1865,6 @@ static void drain_stock(struct memcg_stock_pcp *stock) static void drain_local_stock(struct work_struct *dummy) { struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; unsigned long flags; /* @@ -1857,25 +1872,32 @@ static void drain_local_stock(struct work_struct *dummy) * drain_stock races is that we always operate on local CPU stock * here with IRQ disabled */ - localtry_lock_irqsave(&memcg_stock.stock_lock, flags); + local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - old = drain_obj_stock(stock); + drain_obj_stock(stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); - localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); } -/* - * Cache charges(val) to local per_cpu area. - * This will be consumed by consume_stock() function, later. - */ -static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; unsigned int stock_pages; + unsigned long flags; + + VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg)); + + if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) { + /* + * In case of unlikely failure to lock percpu stock_lock + * uncharge memcg directly. + */ + memcg_uncharge(memcg, nr_pages); + return; + } stock = this_cpu_ptr(&memcg_stock); if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */ @@ -1888,26 +1910,8 @@ static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (stock_pages > MEMCG_CHARGE_BATCH) drain_stock(stock); -} -static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) -{ - unsigned long flags; - - if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) { - /* - * In case of unlikely failure to lock percpu stock_lock - * uncharge memcg directly. - */ - if (mem_cgroup_is_root(memcg)) - return; - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, nr_pages); - return; - } - __refill_stock(memcg, nr_pages); - localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); } /* @@ -1958,18 +1962,16 @@ void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; - struct obj_cgroup *old; unsigned long flags; stock = &per_cpu(memcg_stock, cpu); /* drain_obj_stock requires stock_lock */ - localtry_lock_irqsave(&memcg_stock.stock_lock, flags); - old = drain_obj_stock(stock); - localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); + local_lock_irqsave(&memcg_stock.stock_lock, flags); + drain_obj_stock(stock); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); drain_stock(stock); - obj_cgroup_put(old); return 0; } @@ -2765,40 +2767,17 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) obj_cgroup_put(objcg); } -/* Replace the stock objcg with objcg, return the old objcg */ -static struct obj_cgroup *replace_stock_objcg(struct memcg_stock_pcp *stock, - struct obj_cgroup *objcg) -{ - struct obj_cgroup *old = NULL; - - old = drain_obj_stock(stock); - obj_cgroup_get(objcg); - stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) - ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; - WRITE_ONCE(stock->cached_objcg, objcg); - return old; -} - -static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, - enum node_stat_item idx, int nr) +static void __account_obj_stock(struct obj_cgroup *objcg, + struct memcg_stock_pcp *stock, int nr, + struct pglist_data *pgdat, enum node_stat_item idx) { - struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; - unsigned long flags; int *bytes; - localtry_lock_irqsave(&memcg_stock.stock_lock, flags); - stock = this_cpu_ptr(&memcg_stock); - /* * Save vmstat data in stock and skip vmstat array update unless - * accumulating over a page of vmstat data or when pgdat or idx - * changes. + * accumulating over a page of vmstat data or when pgdat changes. */ - if (READ_ONCE(stock->cached_objcg) != objcg) { - old = replace_stock_objcg(stock, objcg); - stock->cached_pgdat = pgdat; - } else if (stock->cached_pgdat != pgdat) { + if (stock->cached_pgdat != pgdat) { /* Flush the existing cached vmstat data */ struct pglist_data *oldpg = stock->cached_pgdat; @@ -2835,36 +2814,37 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, } if (nr) __mod_objcg_mlstate(objcg, pgdat, idx, nr); - - localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); } -static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) +static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, + struct pglist_data *pgdat, enum node_stat_item idx) { struct memcg_stock_pcp *stock; unsigned long flags; bool ret = false; - localtry_lock_irqsave(&memcg_stock.stock_lock, flags); + local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { stock->nr_bytes -= nr_bytes; ret = true; + + if (pgdat) + __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx); } - localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); return ret; } -static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) +static void drain_obj_stock(struct memcg_stock_pcp *stock) { struct obj_cgroup *old = READ_ONCE(stock->cached_objcg); if (!old) - return NULL; + return; if (stock->nr_bytes) { unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; @@ -2875,9 +2855,10 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) memcg = get_mem_cgroup_from_objcg(old); - mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); + __mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); memcg1_account_kmem(memcg, -nr_pages); - __refill_stock(memcg, nr_pages); + if (!mem_cgroup_is_root(memcg)) + memcg_uncharge(memcg, nr_pages); css_put(&memcg->css); } @@ -2916,11 +2897,7 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) } WRITE_ONCE(stock->cached_objcg, NULL); - /* - * The `old' objects needs to be released by the caller via - * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. - */ - return old; + obj_cgroup_put(old); } static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, @@ -2939,40 +2916,48 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, } static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, - bool allow_uncharge) + bool allow_uncharge, int nr_acct, struct pglist_data *pgdat, + enum node_stat_item idx) { struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; unsigned long flags; unsigned int nr_pages = 0; - localtry_lock_irqsave(&memcg_stock.stock_lock, flags); + local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ - old = replace_stock_objcg(stock, objcg); + drain_obj_stock(stock); + obj_cgroup_get(objcg); + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) + ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; + WRITE_ONCE(stock->cached_objcg, objcg); + allow_uncharge = true; /* Allow uncharge when objcg changes */ } stock->nr_bytes += nr_bytes; + if (pgdat) + __account_obj_stock(objcg, stock, nr_acct, pgdat, idx); + if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { nr_pages = stock->nr_bytes >> PAGE_SHIFT; stock->nr_bytes &= (PAGE_SIZE - 1); } - localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); } -int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) +static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size, + struct pglist_data *pgdat, enum node_stat_item idx) { unsigned int nr_pages, nr_bytes; int ret; - if (consume_obj_stock(objcg, size)) + if (likely(consume_obj_stock(objcg, size, pgdat, idx))) return 0; /* @@ -3005,15 +2990,21 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) nr_pages += 1; ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); - if (!ret && nr_bytes) - refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false); + if (!ret && (nr_bytes || pgdat)) + refill_obj_stock(objcg, nr_bytes ? PAGE_SIZE - nr_bytes : 0, + false, size, pgdat, idx); return ret; } +int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) +{ + return obj_cgroup_charge_account(objcg, gfp, size, NULL, 0); +} + void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) { - refill_obj_stock(objcg, size, true); + refill_obj_stock(objcg, size, true, 0, NULL, 0); } static inline size_t obj_full_size(struct kmem_cache *s) @@ -3065,23 +3056,32 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, return false; } - if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s))) - return false; - for (i = 0; i < size; i++) { slab = virt_to_slab(p[i]); if (!slab_obj_exts(slab) && alloc_slab_obj_exts(slab, s, flags, false)) { - obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; } + /* + * if we fail and size is 1, memcg_alloc_abort_single() will + * just free the object, which is ok as we have not assigned + * objcg to its obj_ext yet + * + * for larger sizes, kmem_cache_free_bulk() will uncharge + * any objects that were already charged and obj_ext assigned + * + * TODO: we could batch this until slab_pgdat(slab) changes + * between iterations, with a more complicated undo + */ + if (obj_cgroup_charge_account(objcg, flags, obj_full_size(s), + slab_pgdat(slab), cache_vmstat_idx(s))) + return false; + off = obj_to_index(s, slab, p[i]); obj_cgroup_get(objcg); slab_obj_exts(slab)[off].objcg = objcg; - mod_objcg_state(objcg, slab_pgdat(slab), - cache_vmstat_idx(s), obj_full_size(s)); } return true; @@ -3090,6 +3090,8 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects, struct slabobj_ext *obj_exts) { + size_t obj_size = obj_full_size(s); + for (int i = 0; i < objects; i++) { struct obj_cgroup *objcg; unsigned int off; @@ -3100,9 +3102,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, continue; obj_exts[off].objcg = NULL; - obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), - -obj_full_size(s)); + refill_obj_stock(objcg, obj_size, true, -obj_size, + slab_pgdat(slab), cache_vmstat_idx(s)); obj_cgroup_put(objcg); } } @@ -4698,9 +4699,7 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug) static void uncharge_batch(const struct uncharge_gather *ug) { if (ug->nr_memory) { - page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); - if (do_memsw_account()) - page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); + memcg_uncharge(ug->memcg, ug->nr_memory); if (ug->nr_kmem) { mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem); memcg1_account_kmem(ug->memcg, -ug->nr_kmem); diff --git a/mm/memory.c b/mm/memory.c index 2d8c265fc7d60..037b6ce211f1f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -278,8 +278,17 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, p4d_free_tlb(tlb, p4d, start); } -/* - * This function frees user-level page tables of a process. +/** + * free_pgd_range - Unmap and free page tables in the range + * @tlb: the mmu_gather containing pending TLB flush info + * @addr: virtual address start + * @end: virtual address end + * @floor: lowest address boundary + * @ceiling: highest address boundary + * + * This function tears down all user-level page tables in the + * specified virtual address range [@addr..@end). It is part of + * the memory unmap flow. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, @@ -929,7 +938,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma rss[MM_ANONPAGES]++; /* All done, just insert the new page copy in the child */ - pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); + pte = folio_mk_pte(new_folio, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ @@ -1361,7 +1370,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; - unsigned long next, pfn; + unsigned long next, pfn = 0; bool is_cow; int ret; @@ -1990,35 +1999,64 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, } /** - * zap_page_range_single - remove user pages in a given range + * zap_page_range_single_batched - remove user pages in a given range + * @tlb: pointer to the caller's struct mmu_gather * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to zap - * @size: number of bytes to zap + * @address: starting address of pages to remove + * @size: number of bytes to remove * @details: details of shared cache invalidation * - * The range must fit into one VMA. + * @tlb shouldn't be NULL. The range must fit into one VMA. If @vma is for + * hugetlb, @tlb is flushed and re-initialized by this function. */ -void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, +void zap_page_range_single_batched(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { const unsigned long end = address + size; struct mmu_notifier_range range; - struct mmu_gather tlb; + + VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); - tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); /* * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ - unmap_single_vma(&tlb, vma, address, end, details, false); + unmap_single_vma(tlb, vma, address, end, details, false); mmu_notifier_invalidate_range_end(&range); + if (is_vm_hugetlb_page(vma)) { + /* + * flush tlb and free resources before hugetlb_zap_end(), to + * avoid concurrent page faults' allocation failure. + */ + tlb_finish_mmu(tlb); + hugetlb_zap_end(vma, details); + tlb_gather_mmu(tlb, vma->vm_mm); + } +} + +/** + * zap_page_range_single - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages + * @address: starting address of pages to zap + * @size: number of bytes to zap + * @details: details of shared cache invalidation + * + * The range must fit into one VMA. + */ +void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *details) +{ + struct mmu_gather tlb; + + tlb_gather_mmu(&tlb, vma->vm_mm); + zap_page_range_single_batched(&tlb, vma, address, size, details); tlb_finish_mmu(&tlb); - hugetlb_zap_end(vma, details); } /** @@ -2938,11 +2976,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, if (fn) { do { if (create || !pte_none(ptep_get(pte))) { - err = fn(pte++, addr, data); + err = fn(pte, addr, data); if (err) break; } - } while (addr += PAGE_SIZE, addr != end); + } while (pte++, addr += PAGE_SIZE, addr != end); } *mask |= PGTBL_PTE_MODIFIED; @@ -3523,7 +3561,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); - entry = mk_pte(&new_folio->page, vma->vm_page_prot); + entry = folio_mk_pte(new_folio, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (unlikely(unshare)) { if (pte_soft_dirty(vmf->orig_pte)) @@ -3734,8 +3772,6 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio, return false; VM_WARN_ON_ONCE(folio_test_ksm(folio)); - VM_WARN_ON_ONCE(folio_mapcount(folio) > folio_nr_pages(folio)); - VM_WARN_ON_ONCE(folio_entire_mapcount(folio)); if (unlikely(folio_test_swapcache(folio))) { /* @@ -3760,6 +3796,8 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio, if (folio_large_mapcount(folio) != folio_ref_count(folio)) goto unlock; + VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio); VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id && folio_mm_id(folio, 1) != vma->vm_mm->mm_id); @@ -5013,7 +5051,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) */ __folio_mark_uptodate(folio); - entry = mk_pte(&folio->page, vma->vm_page_prot); + entry = folio_mk_pte(folio, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry), vma); @@ -5188,7 +5226,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) flush_icache_pages(vma, page, HPAGE_PMD_NR); - entry = mk_huge_pmd(page, vma->vm_page_prot); + entry = folio_mk_pmd(folio, vma->vm_page_prot); if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -5245,6 +5283,8 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_write(entry) && folio_test_dirty(folio)) + entry = pte_mkdirty(entry); if (unlikely(vmf_orig_pte_uffd_wp(vmf))) entry = pte_mkuffd_wp(entry); /* copy-on-write page */ diff --git a/mm/mm_init.c b/mm/mm_init.c index 84f14fa12d0dd..4918209735b42 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "internal.h" #include "slab.h" @@ -743,7 +744,7 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } -static void __meminit init_deferred_page(unsigned long pfn, int nid) +static void __meminit __init_deferred_page(unsigned long pfn, int nid) { if (early_page_initialised(pfn, nid)) return; @@ -763,11 +764,16 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } -static inline void init_deferred_page(unsigned long pfn, int nid) +static inline void __init_deferred_page(unsigned long pfn, int nid) { } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ +void __meminit init_deferred_page(unsigned long pfn, int nid) +{ + __init_deferred_page(pfn, nid); +} + /* * Initialised pages do not have PageReserved set. This function is * called for each range allocated by the bootmem allocator and @@ -784,7 +790,7 @@ void __meminit reserve_bootmem_region(phys_addr_t start, if (pfn_valid(start_pfn)) { struct page *page = pfn_to_page(start_pfn); - init_deferred_page(start_pfn, nid); + __init_deferred_page(start_pfn, nid); /* * no need for atomic set_bit because the struct @@ -1441,6 +1447,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) #ifdef CONFIG_UNACCEPTED_MEMORY INIT_LIST_HEAD(&zone->unaccepted_pages); + INIT_WORK(&zone->unaccepted_cleanup, unaccepted_cleanup_work); #endif } @@ -2765,6 +2772,13 @@ void __init mm_core_init(void) report_meminit(); kmsan_init_shadow(); stack_depot_early_init(); + + /* + * KHO memory setup must happen while memblock is still active, but + * as close as possible to buddy initialization + */ + kho_memory_init(); + memblock_free_all(); mem_init(); kmem_cache_init(); diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c index ff4054f4334da..541a99c4071a6 100644 --- a/mm/numa_memblks.c +++ b/mm/numa_memblks.c @@ -200,6 +200,28 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) return numa_add_memblk_to(nid, start, end, &numa_meminfo); } +/** + * numa_add_reserved_memblk - Add one numa_memblk to numa_reserved_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the numa_reserved_meminfo. + * + * Usage Case: numa_cleanup_meminfo() reconciles all numa_memblk instances + * against memblock_type information and moves any that intersect reserved + * ranges to numa_reserved_meminfo. However, when that information is known + * ahead of time, we use numa_add_reserved_memblk() to add the numa_memblk + * to numa_reserved_meminfo directly. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_add_reserved_memblk(int nid, u64 start, u64 end) +{ + return numa_add_memblk_to(nid, start, end, &numa_reserved_meminfo); +} + /** * numa_cleanup_meminfo - Cleanup a numa_meminfo * @mi: numa_meminfo to clean up diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c81624bc39697..20e1d76f1eba3 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -520,8 +520,8 @@ static int dirty_ratio_handler(const struct ctl_table *table, int write, void *b ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { - writeback_set_ratelimit(); vm_dirty_bytes = 0; + writeback_set_ratelimit(); } return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd6b865cb1abf..bbfd8e4ce0dfe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -933,19 +933,13 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) return bad_reason; } -static void free_page_is_bad_report(struct page *page) -{ - bad_page(page, - page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); -} - static inline bool free_page_is_bad(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) return false; /* Something has gone sideways, find it */ - free_page_is_bad_report(page); + bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); return true; } @@ -1400,11 +1394,12 @@ static void free_one_page(struct zone *zone, struct page *page, struct llist_head *llhead; unsigned long flags; - if (!spin_trylock_irqsave(&zone->lock, flags)) { - if (unlikely(fpi_flags & FPI_TRYLOCK)) { + if (unlikely(fpi_flags & FPI_TRYLOCK)) { + if (!spin_trylock_irqsave(&zone->lock, flags)) { add_page_to_zone_llist(zone, page, order); return; } + } else { spin_lock_irqsave(&zone->lock, flags); } @@ -2077,31 +2072,25 @@ static bool should_try_claim_block(unsigned int order, int start_mt) /* * Check whether there is a suitable fallback freepage with requested order. - * Sets *claim_block to instruct the caller whether it should convert a whole - * pageblock to the returned migratetype. - * If only_claim is true, this function returns fallback_mt only if + * If claimable is true, this function returns fallback_mt only if * we would do this whole-block claiming. This would help to reduce * fragmentation due to mixed migratetype pages in one pageblock. */ int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool only_claim, bool *claim_block) + int migratetype, bool claimable) { int i; - int fallback_mt; + + if (claimable && !should_try_claim_block(order, migratetype)) + return -2; if (area->nr_free == 0) return -1; - *claim_block = false; for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { - fallback_mt = fallbacks[migratetype][i]; - if (free_area_empty(area, fallback_mt)) - continue; - - if (should_try_claim_block(order, migratetype)) - *claim_block = true; + int fallback_mt = fallbacks[migratetype][i]; - if (*claim_block || !only_claim) + if (!free_area_empty(area, fallback_mt)) return fallback_mt; } @@ -2182,23 +2171,15 @@ try_to_claim_block(struct zone *zone, struct page *page, } /* - * Try finding a free buddy page on the fallback list. - * - * This will attempt to claim a whole pageblock for the requested type - * to ensure grouping of such requests in the future. - * - * If a whole block cannot be claimed, steal an individual page, regressing to - * __rmqueue_smallest() logic to at least break up as little contiguity as - * possible. + * Try to allocate from some fallback migratetype by claiming the entire block, + * i.e. converting it to the allocation's start migratetype. * * The use of signed ints for order and current_order is a deliberate * deviation from the rest of this file, to make the for loop * condition simpler. - * - * Return the stolen page, or NULL if none can be found. */ static __always_inline struct page * -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype, +__rmqueue_claim(struct zone *zone, int order, int start_migratetype, unsigned int alloc_flags) { struct free_area *area; @@ -2206,7 +2187,6 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, int min_order = order; struct page *page; int fallback_mt; - bool claim_block; /* * Do not steal pages from freelists belonging to other pageblocks @@ -2225,53 +2205,73 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false, &claim_block); + start_migratetype, true); + + /* No block in that order */ if (fallback_mt == -1) continue; - if (!claim_block) + /* Advanced into orders too low to claim, abort */ + if (fallback_mt == -2) break; page = get_page_from_free_area(area, fallback_mt); page = try_to_claim_block(zone, page, current_order, order, start_migratetype, fallback_mt, alloc_flags); - if (page) - goto got_one; + if (page) { + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + return page; + } } - if (alloc_flags & ALLOC_NOFRAGMENT) - return NULL; + return NULL; +} + +/* + * Try to steal a single page from some fallback migratetype. Leave the rest of + * the block as its current migratetype, potentially causing fragmentation. + */ +static __always_inline struct page * +__rmqueue_steal(struct zone *zone, int order, int start_migratetype) +{ + struct free_area *area; + int current_order; + struct page *page; + int fallback_mt; - /* No luck claiming pageblock. Find the smallest fallback page */ for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false, &claim_block); + start_migratetype, false); if (fallback_mt == -1) continue; page = get_page_from_free_area(area, fallback_mt); page_del_and_expand(zone, page, order, current_order, fallback_mt); - goto got_one; + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + return page; } return NULL; - -got_one: - trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, fallback_mt); - - return page; } +enum rmqueue_mode { + RMQUEUE_NORMAL, + RMQUEUE_CMA, + RMQUEUE_CLAIM, + RMQUEUE_STEAL, +}; + /* * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. */ static __always_inline struct page * __rmqueue(struct zone *zone, unsigned int order, int migratetype, - unsigned int alloc_flags) + unsigned int alloc_flags, enum rmqueue_mode *mode) { struct page *page; @@ -2290,16 +2290,48 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, } } - page = __rmqueue_smallest(zone, order, migratetype); - if (unlikely(!page)) { - if (alloc_flags & ALLOC_CMA) + /* + * First try the freelists of the requested migratetype, then try + * fallbacks modes with increasing levels of fragmentation risk. + * + * The fallback logic is expensive and rmqueue_bulk() calls in + * a loop with the zone->lock held, meaning the freelists are + * not subject to any outside changes. Remember in *mode where + * we found pay dirt, to save us the search on the next call. + */ + switch (*mode) { + case RMQUEUE_NORMAL: + page = __rmqueue_smallest(zone, order, migratetype); + if (page) + return page; + fallthrough; + case RMQUEUE_CMA: + if (alloc_flags & ALLOC_CMA) { page = __rmqueue_cma_fallback(zone, order); - - if (!page) - page = __rmqueue_fallback(zone, order, migratetype, - alloc_flags); + if (page) { + *mode = RMQUEUE_CMA; + return page; + } + } + fallthrough; + case RMQUEUE_CLAIM: + page = __rmqueue_claim(zone, order, migratetype, alloc_flags); + if (page) { + /* Replenished preferred freelist, back to normal mode. */ + *mode = RMQUEUE_NORMAL; + return page; + } + fallthrough; + case RMQUEUE_STEAL: + if (!(alloc_flags & ALLOC_NOFRAGMENT)) { + page = __rmqueue_steal(zone, order, migratetype); + if (page) { + *mode = RMQUEUE_STEAL; + return page; + } + } } - return page; + return NULL; } /* @@ -2311,17 +2343,19 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { + enum rmqueue_mode rmqm = RMQUEUE_NORMAL; unsigned long flags; int i; - if (!spin_trylock_irqsave(&zone->lock, flags)) { - if (unlikely(alloc_flags & ALLOC_TRYLOCK)) + if (unlikely(alloc_flags & ALLOC_TRYLOCK)) { + if (!spin_trylock_irqsave(&zone->lock, flags)) return 0; + } else { spin_lock_irqsave(&zone->lock, flags); } for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, - alloc_flags); + alloc_flags, &rmqm); if (unlikely(page == NULL)) break; @@ -2634,7 +2668,7 @@ static void free_frozen_page_commit(struct zone *zone, free_high = (pcp->free_count >= batch && (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) && (!(pcp->flags & PCPF_FREE_HIGH_BATCH) || - pcp->count >= READ_ONCE(batch))); + pcp->count >= batch)); pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER; } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; @@ -2937,15 +2971,18 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, do { page = NULL; - if (!spin_trylock_irqsave(&zone->lock, flags)) { - if (unlikely(alloc_flags & ALLOC_TRYLOCK)) + if (unlikely(alloc_flags & ALLOC_TRYLOCK)) { + if (!spin_trylock_irqsave(&zone->lock, flags)) return NULL; + } else { spin_lock_irqsave(&zone->lock, flags); } if (alloc_flags & ALLOC_HIGHATOMIC) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (!page) { - page = __rmqueue(zone, order, migratetype, alloc_flags); + enum rmqueue_mode rmqm = RMQUEUE_NORMAL; + + page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm); /* * If the allocation fails, allow OOM handling and @@ -3422,18 +3459,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, return false; } -bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int highest_zoneidx) -{ - long free_pages = zone_page_state(z, NR_FREE_PAGES); - - if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) - free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - - return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, - free_pages); -} - #ifdef CONFIG_NUMA int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; @@ -4530,6 +4555,14 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, } retry: + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * infinite retries. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; + /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); @@ -6750,7 +6783,8 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, goto done; } - if (!(gfp_mask & __GFP_COMP)) { + if (!(gfp_mask & __GFP_COMP) || + (is_power_of_2(end - start) && ilog2(end - start) < MAX_PAGE_ORDER)) { split_free_pages(cc.freepages, gfp_mask); /* Free head and tail (if any) */ @@ -6758,7 +6792,15 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, free_contig_range(outer_start, start - outer_start); if (end != outer_end) free_contig_range(end, outer_end - end); - } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) { + + outer_start = start; + outer_end = end; + + if (!(gfp_mask & __GFP_COMP)) + goto done; + } + + if (start == outer_start && end == outer_end && is_power_of_2(end - start)) { struct page *head = pfn_to_page(start); int order = ilog2(end - start); @@ -7143,6 +7185,11 @@ static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages); static bool lazy_accept = true; +void unaccepted_cleanup_work(struct work_struct *work) +{ + static_branch_dec(&zones_with_unaccepted_pages); +} + static int __init accept_memory_parse(char *p) { if (!strcmp(p, "lazy")) { @@ -7181,8 +7228,27 @@ static void __accept_page(struct zone *zone, unsigned long *flags, __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); - if (last) - static_branch_dec(&zones_with_unaccepted_pages); + if (last) { + /* + * There are two corner cases: + * + * - If allocation occurs during the CPU bring up, + * static_branch_dec() cannot be used directly as + * it causes a deadlock on cpu_hotplug_lock. + * + * Instead, use schedule_work() to prevent deadlock. + * + * - If allocation occurs before workqueues are initialized, + * static_branch_dec() should be called directly. + * + * Workqueues are initialized before CPU bring up, so this + * will not conflict with the first scenario. + */ + if (system_wq) + schedule_work(&zone->unaccepted_cleanup); + else + unaccepted_cleanup_work(&zone->unaccepted_cleanup); + } } void accept_page(struct page *page) diff --git a/mm/ptdump.c b/mm/ptdump.c index 106e1d66e9f9e..9374f29cdc6f8 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -18,7 +18,7 @@ static inline int note_kasan_page_table(struct mm_walk *walk, { struct ptdump_state *st = walk->private; - st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0])); + st->note_page_pte(st, addr, kasan_early_shadow_pte[0]); walk->action = ACTION_CONTINUE; @@ -38,11 +38,11 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif - if (st->effective_prot) - st->effective_prot(st, 0, pgd_val(val)); + if (st->effective_prot_pgd) + st->effective_prot_pgd(st, val); if (pgd_leaf(val)) { - st->note_page(st, addr, 0, pgd_val(val)); + st->note_page_pgd(st, addr, val); walk->action = ACTION_CONTINUE; } @@ -61,11 +61,11 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, return note_kasan_page_table(walk, addr); #endif - if (st->effective_prot) - st->effective_prot(st, 1, p4d_val(val)); + if (st->effective_prot_p4d) + st->effective_prot_p4d(st, val); if (p4d_leaf(val)) { - st->note_page(st, addr, 1, p4d_val(val)); + st->note_page_p4d(st, addr, val); walk->action = ACTION_CONTINUE; } @@ -84,11 +84,11 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, return note_kasan_page_table(walk, addr); #endif - if (st->effective_prot) - st->effective_prot(st, 2, pud_val(val)); + if (st->effective_prot_pud) + st->effective_prot_pud(st, val); if (pud_leaf(val)) { - st->note_page(st, addr, 2, pud_val(val)); + st->note_page_pud(st, addr, val); walk->action = ACTION_CONTINUE; } @@ -106,10 +106,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif - if (st->effective_prot) - st->effective_prot(st, 3, pmd_val(val)); + if (st->effective_prot_pmd) + st->effective_prot_pmd(st, val); if (pmd_leaf(val)) { - st->note_page(st, addr, 3, pmd_val(val)); + st->note_page_pmd(st, addr, val); walk->action = ACTION_CONTINUE; } @@ -122,10 +122,10 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr, struct ptdump_state *st = walk->private; pte_t val = ptep_get_lockless(pte); - if (st->effective_prot) - st->effective_prot(st, 4, pte_val(val)); + if (st->effective_prot_pte) + st->effective_prot_pte(st, val); - st->note_page(st, addr, 4, pte_val(val)); + st->note_page_pte(st, addr, val); return 0; } @@ -134,9 +134,31 @@ static int ptdump_hole(unsigned long addr, unsigned long next, int depth, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - - st->note_page(st, addr, depth, 0); - + pte_t pte_zero = {0}; + pmd_t pmd_zero = {0}; + pud_t pud_zero = {0}; + p4d_t p4d_zero = {0}; + pgd_t pgd_zero = {0}; + + switch (depth) { + case 4: + st->note_page_pte(st, addr, pte_zero); + break; + case 3: + st->note_page_pmd(st, addr, pmd_zero); + break; + case 2: + st->note_page_pud(st, addr, pud_zero); + break; + case 1: + st->note_page_p4d(st, addr, p4d_zero); + break; + case 0: + st->note_page_pgd(st, addr, pgd_zero); + break; + default: + break; + } return 0; } @@ -162,7 +184,7 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd) mmap_write_unlock(mm); /* Flush out the last page */ - st->note_page(st, 0, -1, 0); + st->note_page_flush(st); } static int check_wx_show(struct seq_file *m, void *v) diff --git a/mm/show_mem.c b/mm/show_mem.c index 6af13bcd2ab36..ad373b4b6e394 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -94,26 +94,20 @@ void si_meminfo_node(struct sysinfo *val, int nid) unsigned long free_highpages = 0; pg_data_t *pgdat = NODE_DATA(nid); - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); - val->totalram = managed_pages; - val->sharedram = node_page_state(pgdat, NR_SHMEM); - val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); -#ifdef CONFIG_HIGHMEM for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; - + managed_pages += zone_managed_pages(zone); if (is_highmem(zone)) { managed_highpages += zone_managed_pages(zone); free_highpages += zone_page_state(zone, NR_FREE_PAGES); } } + + val->totalram = managed_pages; + val->sharedram = node_page_state(pgdat, NR_SHMEM); + val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); val->totalhigh = managed_highpages; val->freehigh = free_highpages; -#else - val->totalhigh = managed_highpages; - val->freehigh = free_highpages; -#endif val->mem_unit = PAGE_SIZE; } #endif diff --git a/mm/swapfile.c b/mm/swapfile.c index 2eff8b51a9455..e727021b8e2ce 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -52,9 +52,9 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static void swap_entry_range_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr_pages); +static void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); @@ -192,7 +192,7 @@ static bool swap_is_last_map(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; unsigned char count = *map; - if (swap_count(count) != 1) + if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM) return false; while (++map < map_end) { @@ -1346,10 +1346,12 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) return NULL; } -static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, - unsigned long offset, - unsigned char usage) +static unsigned char swap_entry_put_locked(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, + unsigned char usage) { + unsigned long offset = swp_offset(entry); unsigned char count; unsigned char has_cache; @@ -1381,7 +1383,7 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, if (usage) WRITE_ONCE(si->swap_map[offset], usage); else - WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE); + swap_entries_free(si, ci, entry, 1); return usage; } @@ -1452,71 +1454,104 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) return NULL; } -static unsigned char __swap_entry_free(struct swap_info_struct *si, - swp_entry_t entry) +static void swap_entries_put_cache(struct swap_info_struct *si, + swp_entry_t entry, int nr) { - struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); - unsigned char usage; + struct swap_cluster_info *ci; ci = lock_cluster(si, offset); - usage = __swap_entry_free_locked(si, offset, 1); - if (!usage) - swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + if (swap_only_has_cache(si, offset, nr)) + swap_entries_free(si, ci, entry, nr); + else { + for (int i = 0; i < nr; i++, entry.val++) + swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); + } unlock_cluster(ci); - - return usage; } -static bool __swap_entries_free(struct swap_info_struct *si, - swp_entry_t entry, int nr) +static bool swap_entries_put_map(struct swap_info_struct *si, + swp_entry_t entry, int nr) { unsigned long offset = swp_offset(entry); - unsigned int type = swp_type(entry); struct swap_cluster_info *ci; bool has_cache = false; unsigned char count; int i; - if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1) + if (nr <= 1) goto fallback; - /* cross into another cluster */ - if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) + count = swap_count(data_race(si->swap_map[offset])); + if (count != 1 && count != SWAP_MAP_SHMEM) goto fallback; ci = lock_cluster(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { - unlock_cluster(ci); - goto fallback; + goto locked_fallback; } - for (i = 0; i < nr; i++) - WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); if (!has_cache) - swap_entry_range_free(si, ci, entry, nr); + swap_entries_free(si, ci, entry, nr); + else + for (i = 0; i < nr; i++) + WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); unlock_cluster(ci); return has_cache; fallback: - for (i = 0; i < nr; i++) { - if (data_race(si->swap_map[offset + i])) { - count = __swap_entry_free(si, swp_entry(type, offset + i)); - if (count == SWAP_HAS_CACHE) - has_cache = true; - } else { - WARN_ON_ONCE(1); - } + ci = lock_cluster(si, offset); +locked_fallback: + for (i = 0; i < nr; i++, entry.val++) { + count = swap_entry_put_locked(si, ci, entry, 1); + if (count == SWAP_HAS_CACHE) + has_cache = true; } + unlock_cluster(ci); return has_cache; + } /* - * Drop the last HAS_CACHE flag of swap entries, caller have to - * ensure all entries belong to the same cgroup. + * Only functions with "_nr" suffix are able to free entries spanning + * cross multi clusters, so ensure the range is within a single cluster + * when freeing entries with functions without "_nr" suffix. */ -static void swap_entry_range_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr_pages) +static bool swap_entries_put_map_nr(struct swap_info_struct *si, + swp_entry_t entry, int nr) +{ + int cluster_nr, cluster_rest; + unsigned long offset = swp_offset(entry); + bool has_cache = false; + + cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER; + while (nr) { + cluster_nr = min(nr, cluster_rest); + has_cache |= swap_entries_put_map(si, entry, cluster_nr); + cluster_rest = SWAPFILE_CLUSTER; + nr -= cluster_nr; + entry.val += cluster_nr; + } + + return has_cache; +} + +/* + * Check if it's the last ref of swap entry in the freeing path. + * Qualified vlaue includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM. + */ +static inline bool __maybe_unused swap_is_last_ref(unsigned char count) +{ + return (count == SWAP_HAS_CACHE) || (count == 1) || + (count == SWAP_MAP_SHMEM); +} + +/* + * Drop the last ref of swap entries, caller have to ensure all entries + * belong to the same cgroup and cluster. + */ +static void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages) { unsigned long offset = swp_offset(entry); unsigned char *map = si->swap_map + offset; @@ -1529,7 +1564,7 @@ static void swap_entry_range_free(struct swap_info_struct *si, ci->count -= nr_pages; do { - VM_BUG_ON(*map != SWAP_HAS_CACHE); + VM_BUG_ON(!swap_is_last_ref(*map)); *map = 0; } while (++map < map_end); @@ -1542,21 +1577,6 @@ static void swap_entry_range_free(struct swap_info_struct *si, partial_free_cluster(si, ci); } -static void cluster_swap_free_nr(struct swap_info_struct *si, - unsigned long offset, int nr_pages, - unsigned char usage) -{ - struct swap_cluster_info *ci; - unsigned long end = offset + nr_pages; - - ci = lock_cluster(si, offset); - do { - if (!__swap_entry_free_locked(si, offset, usage)) - swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); - } while (++offset < end); - unlock_cluster(ci); -} - /* * Caller has made sure that the swap device corresponding to entry * is still around or has not been recycled. @@ -1573,7 +1593,7 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) while (nr_pages) { nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - cluster_swap_free_nr(sis, offset, nr, 1); + swap_entries_put_map(sis, swp_entry(sis->type, offset), nr); offset += nr; nr_pages -= nr; } @@ -1584,8 +1604,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) */ void put_swap_folio(struct folio *folio, swp_entry_t entry) { - unsigned long offset = swp_offset(entry); - struct swap_cluster_info *ci; struct swap_info_struct *si; int size = 1 << swap_entry_order(folio_order(folio)); @@ -1593,16 +1611,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) if (!si) return; - ci = lock_cluster(si, offset); - if (swap_only_has_cache(si, offset, size)) - swap_entry_range_free(si, ci, entry, size); - else { - for (int i = 0; i < size; i++, entry.val++) { - if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) - swap_entry_range_free(si, ci, entry, 1); - } - } - unlock_cluster(ci); + swap_entries_put_cache(si, entry, size); } int __swap_count(swp_entry_t entry) @@ -1797,7 +1806,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) /* * First free all entries in the range. */ - any_only_cache = __swap_entries_free(si, entry, nr); + any_only_cache = swap_entries_put_map_nr(si, entry, nr); /* * Short-circuit the below loop if none of the entries had their @@ -1807,13 +1816,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) goto out; /* - * Now go back over the range trying to reclaim the swap cache. This is - * more efficient for large folios because we will only try to reclaim - * the swap once per folio in the common case. If we do - * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the - * latter will get a reference and lock the folio for every individual - * page but will only succeed once the swap slot for every subpage is - * zero. + * Now go back over the range trying to reclaim the swap cache. */ for (offset = start_offset; offset < end_offset; offset += nr) { nr = 1; @@ -3636,11 +3639,13 @@ int swapcache_prepare(swp_entry_t entry, int nr) return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); } +/* + * Caller should ensure entries belong to the same folio so + * the entries won't span cross cluster boundary. + */ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { - unsigned long offset = swp_offset(entry); - - cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE); + swap_entries_put_cache(si, entry, nr); } struct swap_info_struct *swp_swap_info(swp_entry_t entry) @@ -3780,7 +3785,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or caller of __swap_entry_free_locked() + * Called while __swap_duplicate() or caller of swap_entry_put_locked() * holds cluster lock. */ static bool swap_count_continued(struct swap_info_struct *si, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index fbf2cf62ab9f5..e8ce92dc105fe 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1063,7 +1063,7 @@ static int move_present_pte(struct mm_struct *mm, folio_move_anon_rmap(src_folio, dst_vma); src_folio->index = linear_page_index(dst_vma, dst_addr); - orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot); + orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); /* Follow mremap() behavior and treat the entry dirty after the move */ orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma); @@ -1902,6 +1902,14 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, unsigned long end) { struct vm_area_struct *ret; + bool give_up_on_oom = false; + + /* + * If we are modifying only and not splitting, just give up on the merge + * if OOM prevents us from merging successfully. + */ + if (start == vma->vm_start && end == vma->vm_end) + give_up_on_oom = true; /* Reset ptes for the whole vma range if wr-protected */ if (userfaultfd_wp(vma)) @@ -1909,7 +1917,7 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, vma->vm_flags & ~__VM_UFFD_FLAGS, - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, give_up_on_oom); /* * In the vma_merge() successful mprotect-like case 8: @@ -1960,7 +1968,8 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, new_flags, - (struct vm_userfaultfd_ctx){ctx}); + (struct vm_userfaultfd_ctx){ctx}, + /* give_up_on_oom = */false); if (IS_ERR(vma)) return PTR_ERR(vma); diff --git a/mm/vma.c b/mm/vma.c index 5cdc5612bfc19..8a6c5e8357593 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -57,6 +57,22 @@ struct mmap_state { .state = VMA_MERGE_START, \ } +/* + * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain + * more than one anon_vma_chain connecting it to more than one anon_vma. A merge + * would mean a wider range of folios sharing the root anon_vma lock, and thus + * potential lock contention, we do not wish to encourage merging such that this + * scales to a problem. + */ +static bool vma_had_uncowed_parents(struct vm_area_struct *vma) +{ + /* + * The list_is_singular() test is to avoid merging VMA cloned from + * parents. This can improve scalability caused by anon_vma lock. + */ + return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain); +} + static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; @@ -82,24 +98,28 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex return true; } -static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, - struct anon_vma *anon_vma2, struct vm_area_struct *vma) +static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next) { + struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev; + struct vm_area_struct *src = vmg->middle; /* exisitng merge case. */ + struct anon_vma *tgt_anon = tgt->anon_vma; + struct anon_vma *src_anon = vmg->anon_vma; + /* - * The list_is_singular() test is to avoid merging VMA cloned from - * parents. This can improve scalability caused by anon_vma lock. + * We _can_ have !src, vmg->anon_vma via copy_vma(). In this instance we + * will remove the existing VMA's anon_vma's so there's no scalability + * concerns. */ - if ((!anon_vma1 || !anon_vma2) && (!vma || - list_is_singular(&vma->anon_vma_chain))) - return true; - return anon_vma1 == anon_vma2; -} + VM_WARN_ON(src && src_anon != src->anon_vma); -/* Are the anon_vma's belonging to each VMA compatible with one another? */ -static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1, - struct vm_area_struct *vma2) -{ - return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL); + /* Case 1 - we will dup_anon_vma() from src into tgt. */ + if (!tgt_anon && src_anon) + return !vma_had_uncowed_parents(src); + /* Case 2 - we will simply use tgt's anon_vma. */ + if (tgt_anon && !src_anon) + return !vma_had_uncowed_parents(tgt); + /* Case 3 - the anon_vma's are already shared. */ + return src_anon == tgt_anon; } /* @@ -164,7 +184,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg) pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); if (is_mergeable_vma(vmg, /* merge_next = */ true) && - is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) { + is_mergeable_anon_vma(vmg, /* merge_next = */ true)) { if (vmg->next->vm_pgoff == vmg->pgoff + pglen) return true; } @@ -184,7 +204,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg) static bool can_vma_merge_after(struct vma_merge_struct *vmg) { if (is_mergeable_vma(vmg, /* merge_next = */ false) && - is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) { + is_mergeable_anon_vma(vmg, /* merge_next = */ false)) { if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff) return true; } @@ -400,8 +420,10 @@ static bool can_vma_merge_left(struct vma_merge_struct *vmg) static bool can_vma_merge_right(struct vma_merge_struct *vmg, bool can_merge_left) { - if (!vmg->next || vmg->end != vmg->next->vm_start || - !can_vma_merge_before(vmg)) + struct vm_area_struct *next = vmg->next; + struct vm_area_struct *prev; + + if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg)) return false; if (!can_merge_left) @@ -414,7 +436,9 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg, * * We therefore check this in addition to mergeability to either side. */ - return are_anon_vmas_compatible(vmg->prev, vmg->next); + prev = vmg->prev; + return !prev->anon_vma || !next->anon_vma || + prev->anon_vma == next->anon_vma; } /* @@ -554,7 +578,9 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, } /* - * dup_anon_vma() - Helper function to duplicate anon_vma + * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the + * instance that the destination VMA has no anon_vma but the source does. + * * @dst: The destination VMA * @src: The source VMA * @dup: Pointer to the destination VMA when successful. @@ -565,9 +591,18 @@ static int dup_anon_vma(struct vm_area_struct *dst, struct vm_area_struct *src, struct vm_area_struct **dup) { /* - * Easily overlooked: when mprotect shifts the boundary, make sure the - * expanding vma has anon_vma set if the shrinking vma had, to cover any - * anon pages imported. + * There are three cases to consider for correctly propagating + * anon_vma's on merge. + * + * The first is trivial - neither VMA has anon_vma, we need not do + * anything. + * + * The second where both have anon_vma is also a no-op, as they must + * then be the same, so there is simply nothing to copy. + * + * Here we cover the third - if the destination VMA has no anon_vma, + * that is it is unfaulted, we need to ensure that the newly merged + * range is referenced by the anon_vma's of the source. */ if (src->anon_vma && !dst->anon_vma) { int ret; @@ -666,6 +701,9 @@ static void vmg_adjust_set_range(struct vma_merge_struct *vmg) /* * Actually perform the VMA merge operation. * + * IMPORTANT: We guarantee that, should vmg->give_up_on_oom is set, to not + * modify any VMAs or cause inconsistent state should an OOM condition arise. + * * Returns 0 on success, or an error value on failure. */ static int commit_merge(struct vma_merge_struct *vmg) @@ -685,6 +723,12 @@ static int commit_merge(struct vma_merge_struct *vmg) init_multi_vma_prep(&vp, vma, vmg); + /* + * If vmg->give_up_on_oom is set, we're safe, because we don't actually + * manipulate any VMAs until we succeed at preallocation. + * + * Past this point, we will not return an error. + */ if (vma_iter_prealloc(vmg->vmi, vma)) return -ENOMEM; @@ -915,7 +959,13 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( if (anon_dup) unlink_anon_vmas(anon_dup); - vmg->state = VMA_MERGE_ERROR_NOMEM; + /* + * We've cleaned up any cloned anon_vma's, no VMAs have been + * modified, no harm no foul if the user requests that we not + * report this and just give up, leaving the VMAs unmerged. + */ + if (!vmg->give_up_on_oom) + vmg->state = VMA_MERGE_ERROR_NOMEM; return NULL; } @@ -926,7 +976,15 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( abort: vma_iter_set(vmg->vmi, start); vma_iter_load(vmg->vmi); - vmg->state = VMA_MERGE_ERROR_NOMEM; + + /* + * This means we have failed to clone anon_vma's correctly, but no + * actual changes to VMAs have occurred, so no harm no foul - if the + * user doesn't want this reported and instead just wants to give up on + * the merge, allow it. + */ + if (!vmg->give_up_on_oom) + vmg->state = VMA_MERGE_ERROR_NOMEM; return NULL; } @@ -1068,6 +1126,10 @@ int vma_expand(struct vma_merge_struct *vmg) /* This should already have been checked by this point. */ VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg); vma_start_write(next); + /* + * In this case we don't report OOM, so vmg->give_up_on_mm is + * safe. + */ ret = dup_anon_vma(middle, next, &anon_dup); if (ret) return ret; @@ -1090,9 +1152,15 @@ int vma_expand(struct vma_merge_struct *vmg) return 0; nomem: - vmg->state = VMA_MERGE_ERROR_NOMEM; if (anon_dup) unlink_anon_vmas(anon_dup); + /* + * If the user requests that we just give upon OOM, we are safe to do so + * here, as commit merge provides this contract to us. Nothing has been + * changed - no harm no foul, just don't report it. + */ + if (!vmg->give_up_on_oom) + vmg->state = VMA_MERGE_ERROR_NOMEM; return -ENOMEM; } @@ -1534,6 +1602,13 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) if (vmg_nomem(vmg)) return ERR_PTR(-ENOMEM); + /* + * Split can fail for reasons other than OOM, so if the user requests + * this it's probably a mistake. + */ + VM_WARN_ON(vmg->give_up_on_oom && + (vma->vm_start != start || vma->vm_end != end)); + /* Split any preceding portion of the VMA. */ if (vma->vm_start < start) { int err = split_vma(vmg->vmi, vma, start, 1); @@ -1602,12 +1677,15 @@ struct vm_area_struct struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, - struct vm_userfaultfd_ctx new_ctx) + struct vm_userfaultfd_ctx new_ctx, + bool give_up_on_oom) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.flags = new_flags; vmg.uffd_ctx = new_ctx; + if (give_up_on_oom) + vmg.give_up_on_oom = true; return vma_modify(&vmg); } diff --git a/mm/vma.h b/mm/vma.h index 7356ca5a22d33..149926e8a6d1a 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -114,6 +114,12 @@ struct vma_merge_struct { */ bool just_expand :1; + /* + * If a merge is possible, but an OOM error occurs, give up and don't + * execute the merge, returning NULL. + */ + bool give_up_on_oom :1; + /* Internal flags set during merge process: */ /* @@ -255,7 +261,8 @@ __must_check struct vm_area_struct struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, - struct vm_userfaultfd_ctx new_ctx); + struct vm_userfaultfd_ctx new_ctx, + bool give_up_on_oom); __must_check struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3ed720a787ecd..77da4613f07ff 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -900,6 +900,11 @@ static struct vmap_node *vmap_nodes = &single; static __read_mostly unsigned int nr_vmap_nodes = 1; static __read_mostly unsigned int vmap_zone_size = 1; +/* A simple iterator over all vmap-nodes. */ +#define for_each_vmap_node(vn) \ + for ((vn) = &vmap_nodes[0]; \ + (vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++) + static inline unsigned int addr_to_node_id(unsigned long addr) { @@ -918,6 +923,19 @@ id_to_node(unsigned int id) return &vmap_nodes[id % nr_vmap_nodes]; } +static inline unsigned int +node_to_id(struct vmap_node *node) +{ + /* Pointer arithmetic. */ + unsigned int id = node - vmap_nodes; + + if (likely(id < nr_vmap_nodes)) + return id; + + WARN_ONCE(1, "An address 0x%p is out-of-bounds.\n", node); + return 0; +} + /* * We use the value 0 to represent "no node", that is why * an encoded value will be the node-id incremented by 1. @@ -1056,12 +1074,11 @@ find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va) { unsigned long va_start_lowest; struct vmap_node *vn; - int i; repeat: - for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + va_start_lowest = 0; + for_each_vmap_node(vn) { spin_lock(&vn->busy.lock); *va = __find_vmap_area_exceed_addr(addr, &vn->busy.root); @@ -2255,9 +2272,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, */ purge_nodes = CPU_MASK_NONE; - for (i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; - + for_each_vmap_node(vn) { INIT_LIST_HEAD(&vn->purge_list); vn->skip_populate = full_pool_decay; decay_va_pool_node(vn, full_pool_decay); @@ -2276,7 +2291,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, end = max(end, list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end); - cpumask_set_cpu(i, &purge_nodes); + cpumask_set_cpu(node_to_id(vn), &purge_nodes); } nr_purge_nodes = cpumask_weight(&purge_nodes); @@ -2355,7 +2370,7 @@ static void free_vmap_area_noflush(struct vmap_area *va) if (WARN_ON_ONCE(!list_empty(&va->list))) return; - nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT, + nr_lazy = atomic_long_add_return_relaxed(va_size(va) >> PAGE_SHIFT, &vmap_lazy_nr); /* @@ -3370,12 +3385,13 @@ void vfree(const void *addr) if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); + /* All pages of vm should be charged to same memcg, so use first one. */ + if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES)) + mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages); for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; BUG_ON(!page); - if (!(vm->flags & VM_MAP_PUT_PAGES)) - mod_memcg_page_state(page, MEMCG_VMALLOC, -1); /* * High-order allocs for huge vmallocs are split, so * can be freed as an array of order-0 allocations @@ -3671,12 +3687,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, node, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (gfp_mask & __GFP_ACCOUNT) { - int i; - - for (i = 0; i < area->nr_pages; i++) - mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1); - } + /* All pages of vm should be charged to same memcg, so use first one. */ + if (gfp_mask & __GFP_ACCOUNT && area->nr_pages) + mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC, + area->nr_pages); /* * If not enough pages were obtained to accomplish an @@ -4942,11 +4956,8 @@ static void show_purge_info(struct seq_file *m) { struct vmap_node *vn; struct vmap_area *va; - int i; - - for (i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + for_each_vmap_node(vn) { spin_lock(&vn->lazy.lock); list_for_each_entry(va, &vn->lazy.head, list) { seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", @@ -4962,11 +4973,8 @@ static int vmalloc_info_show(struct seq_file *m, void *p) struct vmap_node *vn; struct vmap_area *va; struct vm_struct *v; - int i; - - for (i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + for_each_vmap_node(vn) { spin_lock(&vn->busy.lock); list_for_each_entry(va, &vn->busy.head, list) { if (!va->vm) { @@ -5087,7 +5095,7 @@ static void __init vmap_init_free_space(void) static void vmap_init_nodes(void) { struct vmap_node *vn; - int i, n; + int i; #if BITS_PER_LONG == 64 /* @@ -5104,7 +5112,7 @@ static void vmap_init_nodes(void) * set of cores. Therefore a per-domain purging is supposed to * be added as well as a per-domain balancing. */ - n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); + int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); if (n > 1) { vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN); @@ -5119,8 +5127,7 @@ static void vmap_init_nodes(void) } #endif - for (n = 0; n < nr_vmap_nodes; n++) { - vn = &vmap_nodes[n]; + for_each_vmap_node(vn) { vn->busy.root = RB_ROOT; INIT_LIST_HEAD(&vn->busy.head); spin_lock_init(&vn->busy.lock); @@ -5141,15 +5148,13 @@ static void vmap_init_nodes(void) static unsigned long vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - unsigned long count; + unsigned long count = 0; struct vmap_node *vn; - int i, j; - - for (count = 0, i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + int i; - for (j = 0; j < MAX_VA_SIZE_PAGES; j++) - count += READ_ONCE(vn->pool[j].len); + for_each_vmap_node(vn) { + for (i = 0; i < MAX_VA_SIZE_PAGES; i++) + count += READ_ONCE(vn->pool[i].len); } return count ? count : SHRINK_EMPTY; @@ -5158,10 +5163,10 @@ vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc) static unsigned long vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { - int i; + struct vmap_node *vn; - for (i = 0; i < nr_vmap_nodes; i++) - decay_va_pool_node(&vmap_nodes[i], true); + for_each_vmap_node(vn) + decay_va_pool_node(vn, true); return SHRINK_STOP; } diff --git a/mm/vmscan.c b/mm/vmscan.c index b620d74b0f66e..3783e45bfc92e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6736,6 +6736,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) * meet watermarks. */ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { + enum zone_stat_item item; unsigned long free_pages; if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) @@ -6746,11 +6747,33 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) /* * In defrag_mode, watermarks must be met in whole * blocks to avoid polluting allocator fallbacks. + * + * However, kswapd usually cannot accomplish this on + * its own and needs kcompactd support. Once it's + * reclaimed a compaction gap, and kswapd_shrink_node + * has dropped order, simply ensure there are enough + * base pages for compaction, wake kcompactd & sleep. */ - if (defrag_mode) - free_pages = zone_page_state(zone, NR_FREE_PAGES_BLOCKS); + if (defrag_mode && order) + item = NR_FREE_PAGES_BLOCKS; else - free_pages = zone_page_state(zone, NR_FREE_PAGES); + item = NR_FREE_PAGES; + + /* + * When there is a high number of CPUs in the system, + * the cumulative error from the vmstat per-cpu cache + * can blur the line between the watermarks. In that + * case, be safe and get an accurate snapshot. + * + * TODO: NR_FREE_PAGES_BLOCKS moves in steps of + * pageblock_nr_pages, while the vmstat pcp threshold + * is limited to 125. On many configurations that + * counter won't actually be per-cpu cached. But keep + * things simple for now; revisit when somebody cares. + */ + free_pages = zone_page_state(zone, item); + if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark) + free_pages = zone_page_state_snapshot(zone, item); if (__zone_watermark_ok(zone, order, mark, highest_zoneidx, 0, free_pages)) diff --git a/mm/vmstat.c b/mm/vmstat.c index 4c268ce39ff29..ed08bb384ae40 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1347,6 +1347,8 @@ const char * const vmstat_text[] = { "numa_hint_faults", "numa_hint_faults_local", "numa_pages_migrated", + "numa_task_migrated", + "numa_task_swapped", #endif #ifdef CONFIG_MIGRATION "pgmigrate_success", diff --git a/mm/zpdesc.h b/mm/zpdesc.h index fa47fece22372..57e7a4d6c6caf 100644 --- a/mm/zpdesc.h +++ b/mm/zpdesc.h @@ -7,6 +7,9 @@ #ifndef __MM_ZPDESC_H__ #define __MM_ZPDESC_H__ +#include +#include + /* * struct zpdesc - Memory descriptor for zpool memory. * @flags: Page flags, mostly unused by zsmalloc. diff --git a/mm/zpool.c b/mm/zpool.c index 6d6d889309324..0a71d03369f15 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -226,20 +226,22 @@ const char *zpool_get_type(struct zpool *zpool) * @size: The amount of memory to allocate. * @gfp: The GFP flags to use when allocating memory. * @handle: Pointer to the handle to set + * @nid: The preferred node id. * * This allocates the requested amount of memory from the pool. * The gfp flags will be used when allocating memory, if the * implementation supports it. The provided @handle will be - * set to the allocated object handle. + * set to the allocated object handle. The allocation will + * prefer the NUMA node specified by @nid. * * Implementations must guarantee this to be thread-safe. * * Returns: 0 on success, negative value on error. */ int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, - unsigned long *handle) + unsigned long *handle, const int nid) { - return zpool->driver->malloc(zpool->pool, size, gfp, handle); + return zpool->driver->malloc(zpool->pool, size, gfp, handle, nid); } /** diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 961b270f023c2..70406ac94bbd3 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -26,17 +26,10 @@ #include #include #include -#include #include #include #include #include -#include -#include -#include -#include -#include -#include #include #include #include @@ -44,11 +37,8 @@ #include #include #include -#include -#include -#include #include -#include +#include #include "zpdesc.h" #define ZSPAGE_MAGIC 0x58 @@ -243,9 +233,9 @@ static inline void zpdesc_dec_zone_page_state(struct zpdesc *zpdesc) dec_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES); } -static inline struct zpdesc *alloc_zpdesc(gfp_t gfp) +static inline struct zpdesc *alloc_zpdesc(gfp_t gfp, const int nid) { - struct page *page = alloc_page(gfp); + struct page *page = alloc_pages_node(nid, gfp, 0); return page_zpdesc(page); } @@ -462,9 +452,9 @@ static void zs_zpool_destroy(void *pool) } static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, - unsigned long *handle) + unsigned long *handle, const int nid) { - *handle = zs_malloc(pool, size, gfp); + *handle = zs_malloc(pool, size, gfp, nid); if (IS_ERR_VALUE(*handle)) return PTR_ERR((void *)*handle); @@ -1043,8 +1033,8 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, * Allocate a zspage for the given size class */ static struct zspage *alloc_zspage(struct zs_pool *pool, - struct size_class *class, - gfp_t gfp) + struct size_class *class, + gfp_t gfp, const int nid) { int i; struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE]; @@ -1061,7 +1051,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, for (i = 0; i < class->pages_per_zspage; i++) { struct zpdesc *zpdesc; - zpdesc = alloc_zpdesc(gfp); + zpdesc = alloc_zpdesc(gfp, nid); if (!zpdesc) { while (--i >= 0) { zpdesc_dec_zone_page_state(zpdescs[i]); @@ -1336,12 +1326,14 @@ static unsigned long obj_malloc(struct zs_pool *pool, * @pool: pool to allocate from * @size: size of block to allocate * @gfp: gfp flags when allocating object + * @nid: The preferred node id to allocate new zspage (if needed) * * On success, handle to the allocated object is returned, * otherwise an ERR_PTR(). * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. */ -unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp, + const int nid) { unsigned long handle; struct size_class *class; @@ -1376,7 +1368,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) spin_unlock(&class->lock); - zspage = alloc_zspage(pool, class, gfp); + zspage = alloc_zspage(pool, class, gfp, nid); if (!zspage) { cache_free_handle(pool, handle); return (unsigned long)ERR_PTR(-ENOMEM); diff --git a/mm/zswap.c b/mm/zswap.c index 204fb59da33cd..455e9425c5f52 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -981,7 +981,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, zpool = pool->zpool; gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE; - alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle); + alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle, page_to_nid(page)); if (alloc_ret) goto unlock; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 0ddc4c7188332..2769346730667 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -530,16 +530,22 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, u32 off, u32 len, struct sk_psock *psock, struct sock *sk, - struct sk_msg *msg) + struct sk_msg *msg, + bool take_ref) { int num_sge, copied; + /* skb_to_sgvec will fail when the total number of fragments in + * frag_list and frags exceeds MAX_MSG_FRAGS. For example, the + * caller may aggregate multiple skbs. + */ num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); if (num_sge < 0) { /* skb linearize may fail with ENOMEM, but lets simply try again * later if this happens. Under memory pressure we don't want to * drop the skb. We need to linearize the skb so that the mapping * in skb_to_sgvec can not error. + * Note that skb_linearize requires the skb not to be shared. */ if (skb_linearize(skb)) return -EAGAIN; @@ -556,7 +562,7 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, msg->sg.start = 0; msg->sg.size = copied; msg->sg.end = num_sge; - msg->skb = skb; + msg->skb = take_ref ? skb_get(skb) : skb; sk_psock_queue_msg(psock, msg); sk_psock_data_ready(sk, psock); @@ -564,7 +570,7 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, } static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, - u32 off, u32 len); + u32 off, u32 len, bool take_ref); static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len) @@ -578,7 +584,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * correctly. */ if (unlikely(skb->sk == sk)) - return sk_psock_skb_ingress_self(psock, skb, off, len); + return sk_psock_skb_ingress_self(psock, skb, off, len, true); msg = sk_psock_create_ingress_msg(sk, skb); if (!msg) return -EAGAIN; @@ -590,7 +596,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * into user buffers. */ skb_set_owner_r(skb, sk); - err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true); if (err < 0) kfree(msg); return err; @@ -601,7 +607,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * because the skb is already accounted for here. */ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, - u32 off, u32 len) + u32 off, u32 len, bool take_ref) { struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC); struct sock *sk = psock->sk; @@ -610,7 +616,7 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb if (unlikely(!msg)) return -EAGAIN; skb_set_owner_r(skb, sk); - err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref); if (err < 0) kfree(msg); return err; @@ -619,18 +625,13 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool ingress) { - int err = 0; - if (!ingress) { if (!sock_writeable(psock->sk)) return -EAGAIN; return skb_send_sock(psock->sk, skb, off, len); } - skb_get(skb); - err = sk_psock_skb_ingress(psock, skb, off, len); - if (err < 0) - kfree_skb(skb); - return err; + + return sk_psock_skb_ingress(psock, skb, off, len); } static void sk_psock_skb_state(struct sk_psock *psock, @@ -656,11 +657,6 @@ static void sk_psock_backlog(struct work_struct *work) int ret; mutex_lock(&psock->work_mutex); - if (unlikely(state->len)) { - len = state->len; - off = state->off; - } - while ((skb = skb_peek(&psock->ingress_skb))) { len = skb->len; off = 0; @@ -670,6 +666,13 @@ static void sk_psock_backlog(struct work_struct *work) off = stm->offset; len = stm->full_len; } + + /* Resume processing from previous partial state */ + if (unlikely(state->len)) { + len = state->len; + off = state->off; + } + ingress = skb_bpf_ingress(skb); skb_bpf_redirect_clear(skb); do { @@ -680,7 +683,8 @@ static void sk_psock_backlog(struct work_struct *work) if (ret <= 0) { if (ret == -EAGAIN) { sk_psock_skb_state(psock, state, len, off); - + /* Restore redir info we cleared before */ + skb_bpf_set_redir(skb, psock->sk, ingress); /* Delay slightly to prioritize any * other work that might be here. */ @@ -697,6 +701,8 @@ static void sk_psock_backlog(struct work_struct *work) len -= ret; } while (len); + /* The entire skb sent, clear state */ + sk_psock_skb_state(psock, state, 0, 0); skb = skb_dequeue(&psock->ingress_skb); kfree_skb(skb); } @@ -1014,7 +1020,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, off = stm->offset; len = stm->full_len; } - err = sk_psock_skb_ingress_self(psock, skb, off, len); + err = sk_psock_skb_ingress_self(psock, skb, off, len, false); } if (err < 0) { spin_lock_bh(&psock->ingress_lock); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 914d4e1516a3c..f3d7d19482da9 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1120,9 +1120,13 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, num_async++; else if (ret == -ENOMEM) goto wait_for_memory; - else if (ctx->open_rec && ret == -ENOSPC) + else if (ctx->open_rec && ret == -ENOSPC) { + if (msg_pl->cork_bytes) { + ret = 0; + goto send_end; + } goto rollback_iter; - else if (ret != -EAGAIN) + } else if (ret != -EAGAIN) goto send_end; } continue; diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index e1c21eba9b15b..48b80bbc0645b 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -19,6 +19,7 @@ #include "io.c" #include "jump_label.c" #include "kunit.c" +#include "mm.c" #include "mutex.c" #include "page.c" #include "platform.c" diff --git a/rust/helpers/mm.c b/rust/helpers/mm.c new file mode 100644 index 0000000000000..81b510c96fd26 --- /dev/null +++ b/rust/helpers/mm.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +void rust_helper_mmgrab(struct mm_struct *mm) +{ + mmgrab(mm); +} + +void rust_helper_mmdrop(struct mm_struct *mm) +{ + mmdrop(mm); +} + +void rust_helper_mmget(struct mm_struct *mm) +{ + mmget(mm); +} + +bool rust_helper_mmget_not_zero(struct mm_struct *mm) +{ + return mmget_not_zero(mm); +} + +void rust_helper_mmap_read_lock(struct mm_struct *mm) +{ + mmap_read_lock(mm); +} + +bool rust_helper_mmap_read_trylock(struct mm_struct *mm) +{ + return mmap_read_trylock(mm); +} + +void rust_helper_mmap_read_unlock(struct mm_struct *mm) +{ + mmap_read_unlock(mm); +} + +struct vm_area_struct *rust_helper_vma_lookup(struct mm_struct *mm, + unsigned long addr) +{ + return vma_lookup(mm, addr); +} + +void rust_helper_vma_end_read(struct vm_area_struct *vma) +{ + vma_end_read(vma); +} diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index de07aadd1ff5f..42ab6cf4053f2 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -61,6 +61,7 @@ pub mod jump_label; pub mod kunit; pub mod list; pub mod miscdevice; +pub mod mm; #[cfg(CONFIG_NET)] pub mod net; pub mod of; diff --git a/rust/kernel/miscdevice.rs b/rust/kernel/miscdevice.rs index fa9ecc42602a4..9d9771247c386 100644 --- a/rust/kernel/miscdevice.rs +++ b/rust/kernel/miscdevice.rs @@ -14,6 +14,7 @@ use crate::{ error::{to_result, Error, Result, VTABLE_DEFAULT_ERROR}, ffi::{c_int, c_long, c_uint, c_ulong}, fs::File, + mm::virt::VmaNew, prelude::*, seq_file::SeqFile, str::CStr, @@ -119,6 +120,22 @@ pub trait MiscDevice: Sized { drop(device); } + /// Handle for mmap. + /// + /// This function is invoked when a user space process invokes the `mmap` system call on + /// `file`. The function is a callback that is part of the VMA initializer. The kernel will do + /// initial setup of the VMA before calling this function. The function can then interact with + /// the VMA initialization by calling methods of `vma`. If the function does not return an + /// error, the kernel will complete initialization of the VMA according to the properties of + /// `vma`. + fn mmap( + _device: ::Borrowed<'_>, + _file: &File, + _vma: &VmaNew, + ) -> Result { + build_error!(VTABLE_DEFAULT_ERROR) + } + /// Handler for ioctls. /// /// The `cmd` argument is usually manipulated using the utilties in [`kernel::ioctl`]. @@ -223,6 +240,33 @@ impl MiscdeviceVTable { 0 } + /// # Safety + /// + /// `file` must be a valid file that is associated with a `MiscDeviceRegistration`. + /// `vma` must be a vma that is currently being mmap'ed with this file. + unsafe extern "C" fn mmap( + file: *mut bindings::file, + vma: *mut bindings::vm_area_struct, + ) -> c_int { + // SAFETY: The mmap call of a file can access the private data. + let private = unsafe { (*file).private_data }; + // SAFETY: This is a Rust Miscdevice, so we call `into_foreign` in `open` and + // `from_foreign` in `release`, and `fops_mmap` is guaranteed to be called between those + // two operations. + let device = unsafe { ::borrow(private) }; + // SAFETY: The caller provides a vma that is undergoing initial VMA setup. + let area = unsafe { VmaNew::from_raw(vma) }; + // SAFETY: + // * The file is valid for the duration of this call. + // * There is no active fdget_pos region on the file on this thread. + let file = unsafe { File::from_raw_file(file) }; + + match T::mmap(device, file, area) { + Ok(()) => 0, + Err(err) => err.to_errno(), + } + } + /// # Safety /// /// `file` must be a valid file that is associated with a `MiscDeviceRegistration`. @@ -291,6 +335,7 @@ impl MiscdeviceVTable { const VTABLE: bindings::file_operations = bindings::file_operations { open: Some(Self::open), release: Some(Self::release), + mmap: if T::HAS_MMAP { Some(Self::mmap) } else { None }, unlocked_ioctl: if T::HAS_IOCTL { Some(Self::ioctl) } else { diff --git a/rust/kernel/mm.rs b/rust/kernel/mm.rs new file mode 100644 index 0000000000000..615907a0f3b48 --- /dev/null +++ b/rust/kernel/mm.rs @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2024 Google LLC. + +//! Memory management. +//! +//! This module deals with managing the address space of userspace processes. Each process has an +//! instance of [`Mm`], which keeps track of multiple VMAs (virtual memory areas). Each VMA +//! corresponds to a region of memory that the userspace process can access, and the VMA lets you +//! control what happens when userspace reads or writes to that region of memory. +//! +//! C header: [`include/linux/mm.h`](srctree/include/linux/mm.h) +#![cfg(CONFIG_MMU)] + +use crate::{ + bindings, + types::{ARef, AlwaysRefCounted, NotThreadSafe, Opaque}, +}; +use core::{ops::Deref, ptr::NonNull}; + +pub mod virt; +use virt::VmaRef; + +/// A wrapper for the kernel's `struct mm_struct`. +/// +/// This represents the address space of a userspace process, so each process has one `Mm` +/// instance. It may hold many VMAs internally. +/// +/// There is a counter called `mm_users` that counts the users of the address space; this includes +/// the userspace process itself, but can also include kernel threads accessing the address space. +/// Once `mm_users` reaches zero, this indicates that the address space can be destroyed. To access +/// the address space, you must prevent `mm_users` from reaching zero while you are accessing it. +/// The [`MmWithUser`] type represents an address space where this is guaranteed, and you can +/// create one using [`mmget_not_zero`]. +/// +/// The `ARef` smart pointer holds an `mmgrab` refcount. Its destructor may sleep. +/// +/// # Invariants +/// +/// Values of this type are always refcounted using `mmgrab`. +/// +/// [`mmget_not_zero`]: Mm::mmget_not_zero +#[repr(transparent)] +pub struct Mm { + mm: Opaque, +} + +// SAFETY: It is safe to call `mmdrop` on another thread than where `mmgrab` was called. +unsafe impl Send for Mm {} +// SAFETY: All methods on `Mm` can be called in parallel from several threads. +unsafe impl Sync for Mm {} + +// SAFETY: By the type invariants, this type is always refcounted. +unsafe impl AlwaysRefCounted for Mm { + #[inline] + fn inc_ref(&self) { + // SAFETY: The pointer is valid since self is a reference. + unsafe { bindings::mmgrab(self.as_raw()) }; + } + + #[inline] + unsafe fn dec_ref(obj: NonNull) { + // SAFETY: The caller is giving up their refcount. + unsafe { bindings::mmdrop(obj.cast().as_ptr()) }; + } +} + +/// A wrapper for the kernel's `struct mm_struct`. +/// +/// This type is like [`Mm`], but with non-zero `mm_users`. It can only be used when `mm_users` can +/// be proven to be non-zero at compile-time, usually because the relevant code holds an `mmget` +/// refcount. It can be used to access the associated address space. +/// +/// The `ARef` smart pointer holds an `mmget` refcount. Its destructor may sleep. +/// +/// # Invariants +/// +/// Values of this type are always refcounted using `mmget`. The value of `mm_users` is non-zero. +#[repr(transparent)] +pub struct MmWithUser { + mm: Mm, +} + +// SAFETY: It is safe to call `mmput` on another thread than where `mmget` was called. +unsafe impl Send for MmWithUser {} +// SAFETY: All methods on `MmWithUser` can be called in parallel from several threads. +unsafe impl Sync for MmWithUser {} + +// SAFETY: By the type invariants, this type is always refcounted. +unsafe impl AlwaysRefCounted for MmWithUser { + #[inline] + fn inc_ref(&self) { + // SAFETY: The pointer is valid since self is a reference. + unsafe { bindings::mmget(self.as_raw()) }; + } + + #[inline] + unsafe fn dec_ref(obj: NonNull) { + // SAFETY: The caller is giving up their refcount. + unsafe { bindings::mmput(obj.cast().as_ptr()) }; + } +} + +// Make all `Mm` methods available on `MmWithUser`. +impl Deref for MmWithUser { + type Target = Mm; + + #[inline] + fn deref(&self) -> &Mm { + &self.mm + } +} + +/// A wrapper for the kernel's `struct mm_struct`. +/// +/// This type is identical to `MmWithUser` except that it uses `mmput_async` when dropping a +/// refcount. This means that the destructor of `ARef` is safe to call in atomic +/// context. +/// +/// # Invariants +/// +/// Values of this type are always refcounted using `mmget`. The value of `mm_users` is non-zero. +#[repr(transparent)] +pub struct MmWithUserAsync { + mm: MmWithUser, +} + +// SAFETY: It is safe to call `mmput_async` on another thread than where `mmget` was called. +unsafe impl Send for MmWithUserAsync {} +// SAFETY: All methods on `MmWithUserAsync` can be called in parallel from several threads. +unsafe impl Sync for MmWithUserAsync {} + +// SAFETY: By the type invariants, this type is always refcounted. +unsafe impl AlwaysRefCounted for MmWithUserAsync { + #[inline] + fn inc_ref(&self) { + // SAFETY: The pointer is valid since self is a reference. + unsafe { bindings::mmget(self.as_raw()) }; + } + + #[inline] + unsafe fn dec_ref(obj: NonNull) { + // SAFETY: The caller is giving up their refcount. + unsafe { bindings::mmput_async(obj.cast().as_ptr()) }; + } +} + +// Make all `MmWithUser` methods available on `MmWithUserAsync`. +impl Deref for MmWithUserAsync { + type Target = MmWithUser; + + #[inline] + fn deref(&self) -> &MmWithUser { + &self.mm + } +} + +// These methods are safe to call even if `mm_users` is zero. +impl Mm { + /// Returns a raw pointer to the inner `mm_struct`. + #[inline] + pub fn as_raw(&self) -> *mut bindings::mm_struct { + self.mm.get() + } + + /// Obtain a reference from a raw pointer. + /// + /// # Safety + /// + /// The caller must ensure that `ptr` points at an `mm_struct`, and that it is not deallocated + /// during the lifetime 'a. + #[inline] + pub unsafe fn from_raw<'a>(ptr: *const bindings::mm_struct) -> &'a Mm { + // SAFETY: Caller promises that the pointer is valid for 'a. Layouts are compatible due to + // repr(transparent). + unsafe { &*ptr.cast() } + } + + /// Calls `mmget_not_zero` and returns a handle if it succeeds. + #[inline] + pub fn mmget_not_zero(&self) -> Option> { + // SAFETY: The pointer is valid since self is a reference. + let success = unsafe { bindings::mmget_not_zero(self.as_raw()) }; + + if success { + // SAFETY: We just created an `mmget` refcount. + Some(unsafe { ARef::from_raw(NonNull::new_unchecked(self.as_raw().cast())) }) + } else { + None + } + } +} + +// These methods require `mm_users` to be non-zero. +impl MmWithUser { + /// Obtain a reference from a raw pointer. + /// + /// # Safety + /// + /// The caller must ensure that `ptr` points at an `mm_struct`, and that `mm_users` remains + /// non-zero for the duration of the lifetime 'a. + #[inline] + pub unsafe fn from_raw<'a>(ptr: *const bindings::mm_struct) -> &'a MmWithUser { + // SAFETY: Caller promises that the pointer is valid for 'a. The layout is compatible due + // to repr(transparent). + unsafe { &*ptr.cast() } + } + + /// Use `mmput_async` when dropping this refcount. + #[inline] + pub fn into_mmput_async(me: ARef) -> ARef { + // SAFETY: The layouts and invariants are compatible. + unsafe { ARef::from_raw(ARef::into_raw(me).cast()) } + } + + /// Attempt to access a vma using the vma read lock. + /// + /// This is an optimistic trylock operation, so it may fail if there is contention. In that + /// case, you should fall back to taking the mmap read lock. + /// + /// When per-vma locks are disabled, this always returns `None`. + #[inline] + pub fn lock_vma_under_rcu(&self, vma_addr: usize) -> Option> { + #[cfg(CONFIG_PER_VMA_LOCK)] + { + // SAFETY: Calling `bindings::lock_vma_under_rcu` is always okay given an mm where + // `mm_users` is non-zero. + let vma = unsafe { bindings::lock_vma_under_rcu(self.as_raw(), vma_addr) }; + if !vma.is_null() { + return Some(VmaReadGuard { + // SAFETY: If `lock_vma_under_rcu` returns a non-null ptr, then it points at a + // valid vma. The vma is stable for as long as the vma read lock is held. + vma: unsafe { VmaRef::from_raw(vma) }, + _nts: NotThreadSafe, + }); + } + } + + // Silence warnings about unused variables. + #[cfg(not(CONFIG_PER_VMA_LOCK))] + let _ = vma_addr; + + None + } + + /// Lock the mmap read lock. + #[inline] + pub fn mmap_read_lock(&self) -> MmapReadGuard<'_> { + // SAFETY: The pointer is valid since self is a reference. + unsafe { bindings::mmap_read_lock(self.as_raw()) }; + + // INVARIANT: We just acquired the read lock. + MmapReadGuard { + mm: self, + _nts: NotThreadSafe, + } + } + + /// Try to lock the mmap read lock. + #[inline] + pub fn mmap_read_trylock(&self) -> Option> { + // SAFETY: The pointer is valid since self is a reference. + let success = unsafe { bindings::mmap_read_trylock(self.as_raw()) }; + + if success { + // INVARIANT: We just acquired the read lock. + Some(MmapReadGuard { + mm: self, + _nts: NotThreadSafe, + }) + } else { + None + } + } +} + +/// A guard for the mmap read lock. +/// +/// # Invariants +/// +/// This `MmapReadGuard` guard owns the mmap read lock. +pub struct MmapReadGuard<'a> { + mm: &'a MmWithUser, + // `mmap_read_lock` and `mmap_read_unlock` must be called on the same thread + _nts: NotThreadSafe, +} + +impl<'a> MmapReadGuard<'a> { + /// Look up a vma at the given address. + #[inline] + pub fn vma_lookup(&self, vma_addr: usize) -> Option<&virt::VmaRef> { + // SAFETY: By the type invariants we hold the mmap read guard, so we can safely call this + // method. Any value is okay for `vma_addr`. + let vma = unsafe { bindings::vma_lookup(self.mm.as_raw(), vma_addr) }; + + if vma.is_null() { + None + } else { + // SAFETY: We just checked that a vma was found, so the pointer references a valid vma. + // + // Furthermore, the returned vma is still under the protection of the read lock guard + // and can be used while the mmap read lock is still held. That the vma is not used + // after the MmapReadGuard gets dropped is enforced by the borrow-checker. + unsafe { Some(virt::VmaRef::from_raw(vma)) } + } + } +} + +impl Drop for MmapReadGuard<'_> { + #[inline] + fn drop(&mut self) { + // SAFETY: We hold the read lock by the type invariants. + unsafe { bindings::mmap_read_unlock(self.mm.as_raw()) }; + } +} + +/// A guard for the vma read lock. +/// +/// # Invariants +/// +/// This `VmaReadGuard` guard owns the vma read lock. +pub struct VmaReadGuard<'a> { + vma: &'a VmaRef, + // `vma_end_read` must be called on the same thread as where the lock was taken + _nts: NotThreadSafe, +} + +// Make all `VmaRef` methods available on `VmaReadGuard`. +impl Deref for VmaReadGuard<'_> { + type Target = VmaRef; + + #[inline] + fn deref(&self) -> &VmaRef { + self.vma + } +} + +impl Drop for VmaReadGuard<'_> { + #[inline] + fn drop(&mut self) { + // SAFETY: We hold the read lock by the type invariants. + unsafe { bindings::vma_end_read(self.vma.as_ptr()) }; + } +} diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs new file mode 100644 index 0000000000000..31803674aecc5 --- /dev/null +++ b/rust/kernel/mm/virt.rs @@ -0,0 +1,471 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2024 Google LLC. + +//! Virtual memory. +//! +//! This module deals with managing a single VMA in the address space of a userspace process. Each +//! VMA corresponds to a region of memory that the userspace process can access, and the VMA lets +//! you control what happens when userspace reads or writes to that region of memory. +//! +//! The module has several different Rust types that all correspond to the C type called +//! `vm_area_struct`. The different structs represent what kind of access you have to the VMA, e.g. +//! [`VmaRef`] is used when you hold the mmap or vma read lock. Using the appropriate struct +//! ensures that you can't, for example, accidentally call a function that requires holding the +//! write lock when you only hold the read lock. + +use crate::{ + bindings, + error::{code::EINVAL, to_result, Result}, + mm::MmWithUser, + page::Page, + types::Opaque, +}; + +use core::ops::Deref; + +/// A wrapper for the kernel's `struct vm_area_struct` with read access. +/// +/// It represents an area of virtual memory. +/// +/// # Invariants +/// +/// The caller must hold the mmap read lock or the vma read lock. +#[repr(transparent)] +pub struct VmaRef { + vma: Opaque, +} + +// Methods you can call when holding the mmap or vma read lock (or stronger). They must be usable +// no matter what the vma flags are. +impl VmaRef { + /// Access a virtual memory area given a raw pointer. + /// + /// # Safety + /// + /// Callers must ensure that `vma` is valid for the duration of 'a, and that the mmap or vma + /// read lock (or stronger) is held for at least the duration of 'a. + #[inline] + pub unsafe fn from_raw<'a>(vma: *const bindings::vm_area_struct) -> &'a Self { + // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a. + unsafe { &*vma.cast() } + } + + /// Returns a raw pointer to this area. + #[inline] + pub fn as_ptr(&self) -> *mut bindings::vm_area_struct { + self.vma.get() + } + + /// Access the underlying `mm_struct`. + #[inline] + pub fn mm(&self) -> &MmWithUser { + // SAFETY: By the type invariants, this `vm_area_struct` is valid and we hold the mmap/vma + // read lock or stronger. This implies that the underlying mm has a non-zero value of + // `mm_users`. + unsafe { MmWithUser::from_raw((*self.as_ptr()).vm_mm) } + } + + /// Returns the flags associated with the virtual memory area. + /// + /// The possible flags are a combination of the constants in [`flags`]. + #[inline] + pub fn flags(&self) -> vm_flags_t { + // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this + // access is not a data race. + unsafe { (*self.as_ptr()).__bindgen_anon_2.vm_flags } + } + + /// Returns the (inclusive) start address of the virtual memory area. + #[inline] + pub fn start(&self) -> usize { + // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this + // access is not a data race. + unsafe { (*self.as_ptr()).__bindgen_anon_1.__bindgen_anon_1.vm_start } + } + + /// Returns the (exclusive) end address of the virtual memory area. + #[inline] + pub fn end(&self) -> usize { + // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this + // access is not a data race. + unsafe { (*self.as_ptr()).__bindgen_anon_1.__bindgen_anon_1.vm_end } + } + + /// Zap pages in the given page range. + /// + /// This clears page table mappings for the range at the leaf level, leaving all other page + /// tables intact, and freeing any memory referenced by the VMA in this range. That is, + /// anonymous memory is completely freed, file-backed memory has its reference count on page + /// cache folio's dropped, any dirty data will still be written back to disk as usual. + /// + /// It may seem odd that we clear at the leaf level, this is however a product of the page + /// table structure used to map physical memory into a virtual address space - each virtual + /// address actually consists of a bitmap of array indices into page tables, which form a + /// hierarchical page table level structure. + /// + /// As a result, each page table level maps a multiple of page table levels below, and thus + /// span ever increasing ranges of pages. At the leaf or PTE level, we map the actual physical + /// memory. + /// + /// It is here where a zap operates, as it the only place we can be certain of clearing without + /// impacting any other virtual mappings. It is an implementation detail as to whether the + /// kernel goes further in freeing unused page tables, but for the purposes of this operation + /// we must only assume that the leaf level is cleared. + #[inline] + pub fn zap_page_range_single(&self, address: usize, size: usize) { + let (end, did_overflow) = address.overflowing_add(size); + if did_overflow || address < self.start() || self.end() < end { + // TODO: call WARN_ONCE once Rust version of it is added + return; + } + + // SAFETY: By the type invariants, the caller has read access to this VMA, which is + // sufficient for this method call. This method has no requirements on the vma flags. The + // address range is checked to be within the vma. + unsafe { + bindings::zap_page_range_single(self.as_ptr(), address, size, core::ptr::null_mut()) + }; + } + + /// If the [`VM_MIXEDMAP`] flag is set, returns a [`VmaMixedMap`] to this VMA, otherwise + /// returns `None`. + /// + /// This can be used to access methods that require [`VM_MIXEDMAP`] to be set. + /// + /// [`VM_MIXEDMAP`]: flags::MIXEDMAP + #[inline] + pub fn as_mixedmap_vma(&self) -> Option<&VmaMixedMap> { + if self.flags() & flags::MIXEDMAP != 0 { + // SAFETY: We just checked that `VM_MIXEDMAP` is set. All other requirements are + // satisfied by the type invariants of `VmaRef`. + Some(unsafe { VmaMixedMap::from_raw(self.as_ptr()) }) + } else { + None + } + } +} + +/// A wrapper for the kernel's `struct vm_area_struct` with read access and [`VM_MIXEDMAP`] set. +/// +/// It represents an area of virtual memory. +/// +/// This struct is identical to [`VmaRef`] except that it must only be used when the +/// [`VM_MIXEDMAP`] flag is set on the vma. +/// +/// # Invariants +/// +/// The caller must hold the mmap read lock or the vma read lock. The `VM_MIXEDMAP` flag must be +/// set. +/// +/// [`VM_MIXEDMAP`]: flags::MIXEDMAP +#[repr(transparent)] +pub struct VmaMixedMap { + vma: VmaRef, +} + +// Make all `VmaRef` methods available on `VmaMixedMap`. +impl Deref for VmaMixedMap { + type Target = VmaRef; + + #[inline] + fn deref(&self) -> &VmaRef { + &self.vma + } +} + +impl VmaMixedMap { + /// Access a virtual memory area given a raw pointer. + /// + /// # Safety + /// + /// Callers must ensure that `vma` is valid for the duration of 'a, and that the mmap read lock + /// (or stronger) is held for at least the duration of 'a. The `VM_MIXEDMAP` flag must be set. + #[inline] + pub unsafe fn from_raw<'a>(vma: *const bindings::vm_area_struct) -> &'a Self { + // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a. + unsafe { &*vma.cast() } + } + + /// Maps a single page at the given address within the virtual memory area. + /// + /// This operation does not take ownership of the page. + #[inline] + pub fn vm_insert_page(&self, address: usize, page: &Page) -> Result { + // SAFETY: By the type invariant of `Self` caller has read access and has verified that + // `VM_MIXEDMAP` is set. By invariant on `Page` the page has order 0. + to_result(unsafe { bindings::vm_insert_page(self.as_ptr(), address, page.as_ptr()) }) + } +} + +/// A configuration object for setting up a VMA in an `f_ops->mmap()` hook. +/// +/// The `f_ops->mmap()` hook is called when a new VMA is being created, and the hook is able to +/// configure the VMA in various ways to fit the driver that owns it. Using `VmaNew` indicates that +/// you are allowed to perform operations on the VMA that can only be performed before the VMA is +/// fully initialized. +/// +/// # Invariants +/// +/// For the duration of 'a, the referenced vma must be undergoing initialization in an +/// `f_ops->mmap()` hook. +pub struct VmaNew { + vma: VmaRef, +} + +// Make all `VmaRef` methods available on `VmaNew`. +impl Deref for VmaNew { + type Target = VmaRef; + + #[inline] + fn deref(&self) -> &VmaRef { + &self.vma + } +} + +impl VmaNew { + /// Access a virtual memory area given a raw pointer. + /// + /// # Safety + /// + /// Callers must ensure that `vma` is undergoing initial vma setup for the duration of 'a. + #[inline] + pub unsafe fn from_raw<'a>(vma: *mut bindings::vm_area_struct) -> &'a Self { + // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a. + unsafe { &*vma.cast() } + } + + /// Internal method for updating the vma flags. + /// + /// # Safety + /// + /// This must not be used to set the flags to an invalid value. + #[inline] + unsafe fn update_flags(&self, set: vm_flags_t, unset: vm_flags_t) { + let mut flags = self.flags(); + flags |= set; + flags &= !unset; + + // SAFETY: This is not a data race: the vma is undergoing initial setup, so it's not yet + // shared. Additionally, `VmaNew` is `!Sync`, so it cannot be used to write in parallel. + // The caller promises that this does not set the flags to an invalid value. + unsafe { (*self.as_ptr()).__bindgen_anon_2.__vm_flags = flags }; + } + + /// Set the `VM_MIXEDMAP` flag on this vma. + /// + /// This enables the vma to contain both `struct page` and pure PFN pages. Returns a reference + /// that can be used to call `vm_insert_page` on the vma. + #[inline] + pub fn set_mixedmap(&self) -> &VmaMixedMap { + // SAFETY: We don't yet provide a way to set VM_PFNMAP, so this cannot put the flags in an + // invalid state. + unsafe { self.update_flags(flags::MIXEDMAP, 0) }; + + // SAFETY: We just set `VM_MIXEDMAP` on the vma. + unsafe { VmaMixedMap::from_raw(self.vma.as_ptr()) } + } + + /// Set the `VM_IO` flag on this vma. + /// + /// This is used for memory mapped IO and similar. The flag tells other parts of the kernel to + /// avoid looking at the pages. For memory mapped IO this is useful as accesses to the pages + /// could have side effects. + #[inline] + pub fn set_io(&self) { + // SAFETY: Setting the VM_IO flag is always okay. + unsafe { self.update_flags(flags::IO, 0) }; + } + + /// Set the `VM_DONTEXPAND` flag on this vma. + /// + /// This prevents the vma from being expanded with `mremap()`. + #[inline] + pub fn set_dontexpand(&self) { + // SAFETY: Setting the VM_DONTEXPAND flag is always okay. + unsafe { self.update_flags(flags::DONTEXPAND, 0) }; + } + + /// Set the `VM_DONTCOPY` flag on this vma. + /// + /// This prevents the vma from being copied on fork. This option is only permanent if `VM_IO` + /// is set. + #[inline] + pub fn set_dontcopy(&self) { + // SAFETY: Setting the VM_DONTCOPY flag is always okay. + unsafe { self.update_flags(flags::DONTCOPY, 0) }; + } + + /// Set the `VM_DONTDUMP` flag on this vma. + /// + /// This prevents the vma from being included in core dumps. This option is only permanent if + /// `VM_IO` is set. + #[inline] + pub fn set_dontdump(&self) { + // SAFETY: Setting the VM_DONTDUMP flag is always okay. + unsafe { self.update_flags(flags::DONTDUMP, 0) }; + } + + /// Returns whether `VM_READ` is set. + /// + /// This flag indicates whether userspace is mapping this vma as readable. + #[inline] + pub fn readable(&self) -> bool { + (self.flags() & flags::READ) != 0 + } + + /// Try to clear the `VM_MAYREAD` flag, failing if `VM_READ` is set. + /// + /// This flag indicates whether userspace is allowed to make this vma readable with + /// `mprotect()`. + /// + /// Note that this operation is irreversible. Once `VM_MAYREAD` has been cleared, it can never + /// be set again. + #[inline] + pub fn try_clear_mayread(&self) -> Result { + if self.readable() { + return Err(EINVAL); + } + // SAFETY: Clearing `VM_MAYREAD` is okay when `VM_READ` is not set. + unsafe { self.update_flags(0, flags::MAYREAD) }; + Ok(()) + } + + /// Returns whether `VM_WRITE` is set. + /// + /// This flag indicates whether userspace is mapping this vma as writable. + #[inline] + pub fn writable(&self) -> bool { + (self.flags() & flags::WRITE) != 0 + } + + /// Try to clear the `VM_MAYWRITE` flag, failing if `VM_WRITE` is set. + /// + /// This flag indicates whether userspace is allowed to make this vma writable with + /// `mprotect()`. + /// + /// Note that this operation is irreversible. Once `VM_MAYWRITE` has been cleared, it can never + /// be set again. + #[inline] + pub fn try_clear_maywrite(&self) -> Result { + if self.writable() { + return Err(EINVAL); + } + // SAFETY: Clearing `VM_MAYWRITE` is okay when `VM_WRITE` is not set. + unsafe { self.update_flags(0, flags::MAYWRITE) }; + Ok(()) + } + + /// Returns whether `VM_EXEC` is set. + /// + /// This flag indicates whether userspace is mapping this vma as executable. + #[inline] + pub fn executable(&self) -> bool { + (self.flags() & flags::EXEC) != 0 + } + + /// Try to clear the `VM_MAYEXEC` flag, failing if `VM_EXEC` is set. + /// + /// This flag indicates whether userspace is allowed to make this vma executable with + /// `mprotect()`. + /// + /// Note that this operation is irreversible. Once `VM_MAYEXEC` has been cleared, it can never + /// be set again. + #[inline] + pub fn try_clear_mayexec(&self) -> Result { + if self.executable() { + return Err(EINVAL); + } + // SAFETY: Clearing `VM_MAYEXEC` is okay when `VM_EXEC` is not set. + unsafe { self.update_flags(0, flags::MAYEXEC) }; + Ok(()) + } +} + +/// The integer type used for vma flags. +#[doc(inline)] +pub use bindings::vm_flags_t; + +/// All possible flags for [`VmaRef`]. +pub mod flags { + use super::vm_flags_t; + use crate::bindings; + + /// No flags are set. + pub const NONE: vm_flags_t = bindings::VM_NONE as _; + + /// Mapping allows reads. + pub const READ: vm_flags_t = bindings::VM_READ as _; + + /// Mapping allows writes. + pub const WRITE: vm_flags_t = bindings::VM_WRITE as _; + + /// Mapping allows execution. + pub const EXEC: vm_flags_t = bindings::VM_EXEC as _; + + /// Mapping is shared. + pub const SHARED: vm_flags_t = bindings::VM_SHARED as _; + + /// Mapping may be updated to allow reads. + pub const MAYREAD: vm_flags_t = bindings::VM_MAYREAD as _; + + /// Mapping may be updated to allow writes. + pub const MAYWRITE: vm_flags_t = bindings::VM_MAYWRITE as _; + + /// Mapping may be updated to allow execution. + pub const MAYEXEC: vm_flags_t = bindings::VM_MAYEXEC as _; + + /// Mapping may be updated to be shared. + pub const MAYSHARE: vm_flags_t = bindings::VM_MAYSHARE as _; + + /// Page-ranges managed without `struct page`, just pure PFN. + pub const PFNMAP: vm_flags_t = bindings::VM_PFNMAP as _; + + /// Memory mapped I/O or similar. + pub const IO: vm_flags_t = bindings::VM_IO as _; + + /// Do not copy this vma on fork. + pub const DONTCOPY: vm_flags_t = bindings::VM_DONTCOPY as _; + + /// Cannot expand with mremap(). + pub const DONTEXPAND: vm_flags_t = bindings::VM_DONTEXPAND as _; + + /// Lock the pages covered when they are faulted in. + pub const LOCKONFAULT: vm_flags_t = bindings::VM_LOCKONFAULT as _; + + /// Is a VM accounted object. + pub const ACCOUNT: vm_flags_t = bindings::VM_ACCOUNT as _; + + /// Should the VM suppress accounting. + pub const NORESERVE: vm_flags_t = bindings::VM_NORESERVE as _; + + /// Huge TLB Page VM. + pub const HUGETLB: vm_flags_t = bindings::VM_HUGETLB as _; + + /// Synchronous page faults. (DAX-specific) + pub const SYNC: vm_flags_t = bindings::VM_SYNC as _; + + /// Architecture-specific flag. + pub const ARCH_1: vm_flags_t = bindings::VM_ARCH_1 as _; + + /// Wipe VMA contents in child on fork. + pub const WIPEONFORK: vm_flags_t = bindings::VM_WIPEONFORK as _; + + /// Do not include in the core dump. + pub const DONTDUMP: vm_flags_t = bindings::VM_DONTDUMP as _; + + /// Not soft dirty clean area. + pub const SOFTDIRTY: vm_flags_t = bindings::VM_SOFTDIRTY as _; + + /// Can contain `struct page` and pure PFN pages. + pub const MIXEDMAP: vm_flags_t = bindings::VM_MIXEDMAP as _; + + /// MADV_HUGEPAGE marked this vma. + pub const HUGEPAGE: vm_flags_t = bindings::VM_HUGEPAGE as _; + + /// MADV_NOHUGEPAGE marked this vma. + pub const NOHUGEPAGE: vm_flags_t = bindings::VM_NOHUGEPAGE as _; + + /// KSM may merge identical pages. + pub const MERGEABLE: vm_flags_t = bindings::VM_MERGEABLE as _; +} diff --git a/rust/kernel/task.rs b/rust/kernel/task.rs index 9e6f6854948d9..927413d854846 100644 --- a/rust/kernel/task.rs +++ b/rust/kernel/task.rs @@ -7,6 +7,7 @@ use crate::{ bindings, ffi::{c_int, c_long, c_uint}, + mm::MmWithUser, pid_namespace::PidNamespace, types::{ARef, NotThreadSafe, Opaque}, }; @@ -33,22 +34,20 @@ pub const TASK_NORMAL: c_uint = bindings::TASK_NORMAL as c_uint; #[macro_export] macro_rules! current { () => { - // SAFETY: Deref + addr-of below create a temporary `TaskRef` that cannot outlive the - // caller. + // SAFETY: This expression creates a temporary value that is dropped at the end of the + // caller's scope. The following mechanisms ensure that the resulting `&CurrentTask` cannot + // leave current task context: + // + // * To return to userspace, the caller must leave the current scope. + // * Operations such as `begin_new_exec()` are necessarily unsafe and the caller of + // `begin_new_exec()` is responsible for safety. + // * Rust abstractions for things such as a `kthread_use_mm()` scope must require the + // closure to be `Send`, so the `NotThreadSafe` field of `CurrentTask` ensures that the + // `&CurrentTask` cannot cross the scope in either direction. unsafe { &*$crate::task::Task::current() } }; } -/// Returns the currently running task's pid namespace. -#[macro_export] -macro_rules! current_pid_ns { - () => { - // SAFETY: Deref + addr-of below create a temporary `PidNamespaceRef` that cannot outlive - // the caller. - unsafe { &*$crate::task::Task::current_pid_ns() } - }; -} - /// Wraps the kernel's `struct task_struct`. /// /// # Invariants @@ -87,7 +86,7 @@ macro_rules! current_pid_ns { /// impl State { /// fn new() -> Self { /// Self { -/// creator: current!().into(), +/// creator: ARef::from(&**current!()), /// index: 0, /// } /// } @@ -107,6 +106,44 @@ unsafe impl Send for Task {} // synchronised by C code (e.g., `signal_pending`). unsafe impl Sync for Task {} +/// Represents the [`Task`] in the `current` global. +/// +/// This type exists to provide more efficient operations that are only valid on the current task. +/// For example, to retrieve the pid-namespace of a task, you must use rcu protection unless it is +/// the current task. +/// +/// # Invariants +/// +/// Each value of this type must only be accessed from the task context it was created within. +/// +/// Of course, every thread is in a different task context, but for the purposes of this invariant, +/// these operations also permanently leave the task context: +/// +/// * Returning to userspace from system call context. +/// * Calling `release_task()`. +/// * Calling `begin_new_exec()` in a binary format loader. +/// +/// Other operations temporarily create a new sub-context: +/// +/// * Calling `kthread_use_mm()` creates a new context, and `kthread_unuse_mm()` returns to the +/// old context. +/// +/// This means that a `CurrentTask` obtained before a `kthread_use_mm()` call may be used again +/// once `kthread_unuse_mm()` is called, but it must not be used between these two calls. +/// Conversely, a `CurrentTask` obtained between a `kthread_use_mm()`/`kthread_unuse_mm()` pair +/// must not be used after `kthread_unuse_mm()`. +#[repr(transparent)] +pub struct CurrentTask(Task, NotThreadSafe); + +// Make all `Task` methods available on `CurrentTask`. +impl Deref for CurrentTask { + type Target = Task; + #[inline] + fn deref(&self) -> &Task { + &self.0 + } +} + /// The type of process identifiers (PIDs). pub type Pid = bindings::pid_t; @@ -133,119 +170,29 @@ impl Task { /// /// # Safety /// - /// Callers must ensure that the returned object doesn't outlive the current task/thread. - pub unsafe fn current() -> impl Deref { - struct TaskRef<'a> { - task: &'a Task, - _not_send: NotThreadSafe, + /// Callers must ensure that the returned object is only used to access a [`CurrentTask`] + /// within the task context that was active when this function was called. For more details, + /// see the invariants section for [`CurrentTask`]. + pub unsafe fn current() -> impl Deref { + struct TaskRef { + task: *const CurrentTask, } - impl Deref for TaskRef<'_> { - type Target = Task; + impl Deref for TaskRef { + type Target = CurrentTask; fn deref(&self) -> &Self::Target { - self.task + // SAFETY: The returned reference borrows from this `TaskRef`, so it cannot outlive + // the `TaskRef`, which the caller of `Task::current()` has promised will not + // outlive the task/thread for which `self.task` is the `current` pointer. Thus, it + // is okay to return a `CurrentTask` reference here. + unsafe { &*self.task } } } - let current = Task::current_raw(); TaskRef { - // SAFETY: If the current thread is still running, the current task is valid. Given - // that `TaskRef` is not `Send`, we know it cannot be transferred to another thread - // (where it could potentially outlive the caller). - task: unsafe { &*current.cast() }, - _not_send: NotThreadSafe, - } - } - - /// Returns a PidNamespace reference for the currently executing task's/thread's pid namespace. - /// - /// This function can be used to create an unbounded lifetime by e.g., storing the returned - /// PidNamespace in a global variable which would be a bug. So the recommended way to get the - /// current task's/thread's pid namespace is to use the [`current_pid_ns`] macro because it is - /// safe. - /// - /// # Safety - /// - /// Callers must ensure that the returned object doesn't outlive the current task/thread. - pub unsafe fn current_pid_ns() -> impl Deref { - struct PidNamespaceRef<'a> { - task: &'a PidNamespace, - _not_send: NotThreadSafe, - } - - impl Deref for PidNamespaceRef<'_> { - type Target = PidNamespace; - - fn deref(&self) -> &Self::Target { - self.task - } - } - - // The lifetime of `PidNamespace` is bound to `Task` and `struct pid`. - // - // The `PidNamespace` of a `Task` doesn't ever change once the `Task` is alive. A - // `unshare(CLONE_NEWPID)` or `setns(fd_pidns/pidfd, CLONE_NEWPID)` will not have an effect - // on the calling `Task`'s pid namespace. It will only effect the pid namespace of children - // created by the calling `Task`. This invariant guarantees that after having acquired a - // reference to a `Task`'s pid namespace it will remain unchanged. - // - // When a task has exited and been reaped `release_task()` will be called. This will set - // the `PidNamespace` of the task to `NULL`. So retrieving the `PidNamespace` of a task - // that is dead will return `NULL`. Note, that neither holding the RCU lock nor holding a - // referencing count to - // the `Task` will prevent `release_task()` being called. - // - // In order to retrieve the `PidNamespace` of a `Task` the `task_active_pid_ns()` function - // can be used. There are two cases to consider: - // - // (1) retrieving the `PidNamespace` of the `current` task - // (2) retrieving the `PidNamespace` of a non-`current` task - // - // From system call context retrieving the `PidNamespace` for case (1) is always safe and - // requires neither RCU locking nor a reference count to be held. Retrieving the - // `PidNamespace` after `release_task()` for current will return `NULL` but no codepath - // like that is exposed to Rust. - // - // Retrieving the `PidNamespace` from system call context for (2) requires RCU protection. - // Accessing `PidNamespace` outside of RCU protection requires a reference count that - // must've been acquired while holding the RCU lock. Note that accessing a non-`current` - // task means `NULL` can be returned as the non-`current` task could have already passed - // through `release_task()`. - // - // To retrieve (1) the `current_pid_ns!()` macro should be used which ensure that the - // returned `PidNamespace` cannot outlive the calling scope. The associated - // `current_pid_ns()` function should not be called directly as it could be abused to - // created an unbounded lifetime for `PidNamespace`. The `current_pid_ns!()` macro allows - // Rust to handle the common case of accessing `current`'s `PidNamespace` without RCU - // protection and without having to acquire a reference count. - // - // For (2) the `task_get_pid_ns()` method must be used. This will always acquire a - // reference on `PidNamespace` and will return an `Option` to force the caller to - // explicitly handle the case where `PidNamespace` is `None`, something that tends to be - // forgotten when doing the equivalent operation in `C`. Missing RCU primitives make it - // difficult to perform operations that are otherwise safe without holding a reference - // count as long as RCU protection is guaranteed. But it is not important currently. But we - // do want it in the future. - // - // Note for (2) the required RCU protection around calling `task_active_pid_ns()` - // synchronizes against putting the last reference of the associated `struct pid` of - // `task->thread_pid`. The `struct pid` stored in that field is used to retrieve the - // `PidNamespace` of the caller. When `release_task()` is called `task->thread_pid` will be - // `NULL`ed and `put_pid()` on said `struct pid` will be delayed in `free_pid()` via - // `call_rcu()` allowing everyone with an RCU protected access to the `struct pid` acquired - // from `task->thread_pid` to finish. - // - // SAFETY: The current task's pid namespace is valid as long as the current task is running. - let pidns = unsafe { bindings::task_active_pid_ns(Task::current_raw()) }; - PidNamespaceRef { - // SAFETY: If the current thread is still running, the current task and its associated - // pid namespace are valid. `PidNamespaceRef` is not `Send`, so we know it cannot be - // transferred to another thread (where it could potentially outlive the current - // `Task`). The caller needs to ensure that the PidNamespaceRef doesn't outlive the - // current task/thread. - task: unsafe { PidNamespace::from_ptr(pidns) }, - _not_send: NotThreadSafe, + // CAST: The layout of `struct task_struct` and `CurrentTask` is identical. + task: Task::current_raw().cast(), } } @@ -328,6 +275,70 @@ impl Task { } } +impl CurrentTask { + /// Access the address space of the current task. + /// + /// This function does not touch the refcount of the mm. + #[inline] + pub fn mm(&self) -> Option<&MmWithUser> { + // SAFETY: The `mm` field of `current` is not modified from other threads, so reading it is + // not a data race. + let mm = unsafe { (*self.as_ptr()).mm }; + + if mm.is_null() { + return None; + } + + // SAFETY: If `current->mm` is non-null, then it references a valid mm with a non-zero + // value of `mm_users`. Furthermore, the returned `&MmWithUser` borrows from this + // `CurrentTask`, so it cannot escape the scope in which the current pointer was obtained. + // + // This is safe even if `kthread_use_mm()`/`kthread_unuse_mm()` are used. There are two + // relevant cases: + // * If the `&CurrentTask` was created before `kthread_use_mm()`, then it cannot be + // accessed during the `kthread_use_mm()`/`kthread_unuse_mm()` scope due to the + // `NotThreadSafe` field of `CurrentTask`. + // * If the `&CurrentTask` was created within a `kthread_use_mm()`/`kthread_unuse_mm()` + // scope, then the `&CurrentTask` cannot escape that scope, so the returned `&MmWithUser` + // also cannot escape that scope. + // In either case, it's not possible to read `current->mm` and keep using it after the + // scope is ended with `kthread_unuse_mm()`. + Some(unsafe { MmWithUser::from_raw(mm) }) + } + + /// Access the pid namespace of the current task. + /// + /// This function does not touch the refcount of the namespace or use RCU protection. + /// + /// To access the pid namespace of another task, see [`Task::get_pid_ns`]. + #[doc(alias = "task_active_pid_ns")] + #[inline] + pub fn active_pid_ns(&self) -> Option<&PidNamespace> { + // SAFETY: It is safe to call `task_active_pid_ns` without RCU protection when calling it + // on the current task. + let active_ns = unsafe { bindings::task_active_pid_ns(self.as_ptr()) }; + + if active_ns.is_null() { + return None; + } + + // The lifetime of `PidNamespace` is bound to `Task` and `struct pid`. + // + // The `PidNamespace` of a `Task` doesn't ever change once the `Task` is alive. + // + // From system call context retrieving the `PidNamespace` for the current task is always + // safe and requires neither RCU locking nor a reference count to be held. Retrieving the + // `PidNamespace` after `release_task()` for current will return `NULL` but no codepath + // like that is exposed to Rust. + // + // SAFETY: If `current`'s pid ns is non-null, then it references a valid pid ns. + // Furthermore, the returned `&PidNamespace` borrows from this `CurrentTask`, so it cannot + // escape the scope in which the current pointer was obtained, e.g. it cannot live past a + // `release_task()` call. + Some(unsafe { PidNamespace::from_ptr(active_ns) }) + } +} + // SAFETY: The type invariants guarantee that `Task` is always refcounted. unsafe impl crate::types::AlwaysRefCounted for Task { fn inc_ref(&self) { diff --git a/samples/damon/prcl.c b/samples/damon/prcl.c index c3acbdab7a620..056b1b21a0fe9 100644 --- a/samples/damon/prcl.c +++ b/samples/damon/prcl.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * proactive reclamation: monitor access pattern of a given process, find - * regiosn that seems not accessed, and proactively page out the regions. + * regions that seems not accessed, and proactively page out the regions. */ #define pr_fmt(fmt) "damon_sample_prcl: " fmt diff --git a/samples/livepatch/livepatch-callbacks-busymod.c b/samples/livepatch/livepatch-callbacks-busymod.c index 69105596e72e6..fadc2a85cb35d 100644 --- a/samples/livepatch/livepatch-callbacks-busymod.c +++ b/samples/livepatch/livepatch-callbacks-busymod.c @@ -56,4 +56,5 @@ static void livepatch_callbacks_mod_exit(void) module_init(livepatch_callbacks_mod_init); module_exit(livepatch_callbacks_mod_exit); +MODULE_DESCRIPTION("Live patching demo for (un)patching callbacks, support module"); MODULE_LICENSE("GPL"); diff --git a/samples/livepatch/livepatch-callbacks-demo.c b/samples/livepatch/livepatch-callbacks-demo.c index 11c3f4357812d..9e69d9caed258 100644 --- a/samples/livepatch/livepatch-callbacks-demo.c +++ b/samples/livepatch/livepatch-callbacks-demo.c @@ -192,5 +192,6 @@ static void livepatch_callbacks_demo_exit(void) module_init(livepatch_callbacks_demo_init); module_exit(livepatch_callbacks_demo_exit); +MODULE_DESCRIPTION("Live patching demo for (un)patching callbacks"); MODULE_LICENSE("GPL"); MODULE_INFO(livepatch, "Y"); diff --git a/samples/livepatch/livepatch-callbacks-mod.c b/samples/livepatch/livepatch-callbacks-mod.c index 2a074f422a51d..d1851b471ad91 100644 --- a/samples/livepatch/livepatch-callbacks-mod.c +++ b/samples/livepatch/livepatch-callbacks-mod.c @@ -38,4 +38,5 @@ static void livepatch_callbacks_mod_exit(void) module_init(livepatch_callbacks_mod_init); module_exit(livepatch_callbacks_mod_exit); +MODULE_DESCRIPTION("Live patching demo for (un)patching callbacks, support module"); MODULE_LICENSE("GPL"); diff --git a/samples/livepatch/livepatch-sample.c b/samples/livepatch/livepatch-sample.c index cd76d7ebe5985..5263a2f31c480 100644 --- a/samples/livepatch/livepatch-sample.c +++ b/samples/livepatch/livepatch-sample.c @@ -66,5 +66,6 @@ static void livepatch_exit(void) module_init(livepatch_init); module_exit(livepatch_exit); +MODULE_DESCRIPTION("Kernel Live Patching Sample Module"); MODULE_LICENSE("GPL"); MODULE_INFO(livepatch, "Y"); diff --git a/samples/livepatch/livepatch-shadow-fix1.c b/samples/livepatch/livepatch-shadow-fix1.c index f3f153895d6ce..cbf68ca400973 100644 --- a/samples/livepatch/livepatch-shadow-fix1.c +++ b/samples/livepatch/livepatch-shadow-fix1.c @@ -168,5 +168,6 @@ static void livepatch_shadow_fix1_exit(void) module_init(livepatch_shadow_fix1_init); module_exit(livepatch_shadow_fix1_exit); +MODULE_DESCRIPTION("Live patching demo for shadow variables"); MODULE_LICENSE("GPL"); MODULE_INFO(livepatch, "Y"); diff --git a/samples/livepatch/livepatch-shadow-fix2.c b/samples/livepatch/livepatch-shadow-fix2.c index 361046a4f10cf..b99122cb221fc 100644 --- a/samples/livepatch/livepatch-shadow-fix2.c +++ b/samples/livepatch/livepatch-shadow-fix2.c @@ -128,5 +128,6 @@ static void livepatch_shadow_fix2_exit(void) module_init(livepatch_shadow_fix2_init); module_exit(livepatch_shadow_fix2_exit); +MODULE_DESCRIPTION("Live patching demo for shadow variables"); MODULE_LICENSE("GPL"); MODULE_INFO(livepatch, "Y"); diff --git a/scripts/Makefile.btf b/scripts/Makefile.btf index fbaaec2187e53..db76335dd9176 100644 --- a/scripts/Makefile.btf +++ b/scripts/Makefile.btf @@ -23,6 +23,8 @@ else # Switch to using --btf_features for v1.26 and later. pahole-flags-$(call test-ge, $(pahole-ver), 126) = -j$(JOBS) --btf_features=encode_force,var,float,enum64,decl_tag,type_tag,optimized_func,consistent_func,decl_tag_kfuncs +pahole-flags-$(call test-ge, $(pahole-ver), 130) += --btf_features=attributes + ifneq ($(KBUILD_EXTMOD),) module-pahole-flags-$(call test-ge, $(pahole-ver), 128) += --btf_features=distilled_base endif diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 28705ae677849..71d5ac83cf5da 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1506,7 +1506,7 @@ union bpf_attr { __s32 map_token_fd; }; - struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ + struct { /* anonymous struct used by BPF_MAP_*_ELEM and BPF_MAP_FREEZE commands */ __u32 map_fd; __aligned_u64 key; union { @@ -1995,11 +1995,15 @@ union bpf_attr { * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) * Description * Store *len* bytes from address *from* into the packet - * associated to *skb*, at *offset*. *flags* are a combination of - * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the - * checksum for the packet after storing the bytes) and - * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ - * **->swhash** and *skb*\ **->l4hash** to 0). + * associated to *skb*, at *offset*. The *flags* are a combination + * of the following values: + * + * **BPF_F_RECOMPUTE_CSUM** + * Automatically update *skb*\ **->csum** after storing the + * bytes. + * **BPF_F_INVALIDATE_HASH** + * Set *skb*\ **->hash**, *skb*\ **->swhash** and *skb*\ + * **->l4hash** to 0. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers @@ -2051,7 +2055,7 @@ union bpf_attr { * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and * for updates resulting in a null checksum the value is set to * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates - * the checksum is to be computed against a pseudo-header. + * that the modified header field is part of the pseudo-header. * * This helper works in combination with **bpf_csum_diff**\ (), * which does not update the checksum in-place, but offers more diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index 8a27bc5c7a7fe..24ddf7bc4f25f 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -40,6 +40,15 @@ #define BLOCK_SIZE_BITS 10 #define BLOCK_SIZE (1< sec_sz) { + if (sec_off + prog_sz > sec_sz || sec_off + prog_sz < sec_off) { pr_warn("sec '%s': program at offset %zu crosses section boundary\n", sec_name, sec_off); return -LIBBPF_ERRNO__FORMAT; @@ -1725,15 +1725,6 @@ static Elf64_Sym *find_elf_var_sym(const struct bpf_object *obj, const char *nam return ERR_PTR(-ENOENT); } -/* Some versions of Android don't provide memfd_create() in their libc - * implementation, so avoid complications and just go straight to Linux - * syscall. - */ -static int sys_memfd_create(const char *name, unsigned flags) -{ - return syscall(__NR_memfd_create, name, flags); -} - #ifndef MFD_CLOEXEC #define MFD_CLOEXEC 0x0001U #endif @@ -9455,6 +9446,30 @@ int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log return 0; } +struct bpf_func_info *bpf_program__func_info(const struct bpf_program *prog) +{ + if (prog->func_info_rec_size != sizeof(struct bpf_func_info)) + return libbpf_err_ptr(-EOPNOTSUPP); + return prog->func_info; +} + +__u32 bpf_program__func_info_cnt(const struct bpf_program *prog) +{ + return prog->func_info_cnt; +} + +struct bpf_line_info *bpf_program__line_info(const struct bpf_program *prog) +{ + if (prog->line_info_rec_size != sizeof(struct bpf_line_info)) + return libbpf_err_ptr(-EOPNOTSUPP); + return prog->line_info; +} + +__u32 bpf_program__line_info_cnt(const struct bpf_program *prog) +{ + return prog->line_info_cnt; +} + #define SEC_DEF(sec_pfx, ptype, atype, flags, ...) { \ .sec = (char *)sec_pfx, \ .prog_type = BPF_PROG_TYPE_##ptype, \ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index e0605403f9773..d39f19c8396dc 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -940,6 +940,12 @@ LIBBPF_API int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_le LIBBPF_API const char *bpf_program__log_buf(const struct bpf_program *prog, size_t *log_size); LIBBPF_API int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log_size); +LIBBPF_API struct bpf_func_info *bpf_program__func_info(const struct bpf_program *prog); +LIBBPF_API __u32 bpf_program__func_info_cnt(const struct bpf_program *prog); + +LIBBPF_API struct bpf_line_info *bpf_program__line_info(const struct bpf_program *prog); +LIBBPF_API __u32 bpf_program__line_info_cnt(const struct bpf_program *prog); + /** * @brief **bpf_program__set_attach_target()** sets BTF-based attach target * for supported BPF program types: diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index d8b71f22f197c..1205f9a4fe048 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -437,6 +437,10 @@ LIBBPF_1.6.0 { bpf_linker__add_fd; bpf_linker__new_fd; bpf_object__prepare; + bpf_program__func_info; + bpf_program__func_info_cnt; + bpf_program__line_info; + bpf_program__line_info_cnt; btf__add_decl_attr; btf__add_type_attr; } LIBBPF_1.5.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 76669c73dcd16..477a3b3389a09 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -667,6 +667,15 @@ static inline int sys_dup3(int oldfd, int newfd, int flags) return syscall(__NR_dup3, oldfd, newfd, flags); } +/* Some versions of Android don't provide memfd_create() in their libc + * implementation, so avoid complications and just go straight to Linux + * syscall. + */ +static inline int sys_memfd_create(const char *name, unsigned flags) +{ + return syscall(__NR_memfd_create, name, flags); +} + /* Point *fixed_fd* to the same file that *tmp_fd* points to. * Regardless of success, *tmp_fd* is closed. * Whatever *fixed_fd* pointed to is closed silently. diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 800e0ef09c378..56f5068e2ebab 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -573,7 +573,7 @@ int bpf_linker__add_buf(struct bpf_linker *linker, void *buf, size_t buf_sz, snprintf(filename, sizeof(filename), "mem:%p+%zu", buf, buf_sz); - fd = memfd_create(filename, 0); + fd = sys_memfd_create(filename, 0); if (fd < 0) { ret = -errno; pr_warn("failed to create memfd '%s': %s\n", filename, errstr(ret)); diff --git a/tools/testing/memblock/tests/alloc_api.c b/tools/testing/memblock/tests/alloc_api.c index 68f1a75cd72c4..c55f67dd367d0 100644 --- a/tools/testing/memblock/tests/alloc_api.c +++ b/tools/testing/memblock/tests/alloc_api.c @@ -134,7 +134,7 @@ static int alloc_top_down_before_check(void) PREFIX_PUSH(); setup_memblock(); - memblock_reserve(memblock_end_of_DRAM() - total_size, r1_size); + memblock_reserve_kern(memblock_end_of_DRAM() - total_size, r1_size); allocated_ptr = run_memblock_alloc(r2_size, SMP_CACHE_BYTES); @@ -182,7 +182,7 @@ static int alloc_top_down_after_check(void) total_size = r1.size + r2_size; - memblock_reserve(r1.base, r1.size); + memblock_reserve_kern(r1.base, r1.size); allocated_ptr = run_memblock_alloc(r2_size, SMP_CACHE_BYTES); @@ -231,8 +231,8 @@ static int alloc_top_down_second_fit_check(void) total_size = r1.size + r2.size + r3_size; - memblock_reserve(r1.base, r1.size); - memblock_reserve(r2.base, r2.size); + memblock_reserve_kern(r1.base, r1.size); + memblock_reserve_kern(r2.base, r2.size); allocated_ptr = run_memblock_alloc(r3_size, SMP_CACHE_BYTES); @@ -285,8 +285,8 @@ static int alloc_in_between_generic_check(void) total_size = r1.size + r2.size + r3_size; - memblock_reserve(r1.base, r1.size); - memblock_reserve(r2.base, r2.size); + memblock_reserve_kern(r1.base, r1.size); + memblock_reserve_kern(r2.base, r2.size); allocated_ptr = run_memblock_alloc(r3_size, SMP_CACHE_BYTES); @@ -422,7 +422,7 @@ static int alloc_limited_space_generic_check(void) setup_memblock(); /* Simulate almost-full memory */ - memblock_reserve(memblock_start_of_DRAM(), reserved_size); + memblock_reserve_kern(memblock_start_of_DRAM(), reserved_size); allocated_ptr = run_memblock_alloc(available_size, SMP_CACHE_BYTES); @@ -608,7 +608,7 @@ static int alloc_bottom_up_before_check(void) PREFIX_PUSH(); setup_memblock(); - memblock_reserve(memblock_start_of_DRAM() + r1_size, r2_size); + memblock_reserve_kern(memblock_start_of_DRAM() + r1_size, r2_size); allocated_ptr = run_memblock_alloc(r1_size, SMP_CACHE_BYTES); @@ -655,7 +655,7 @@ static int alloc_bottom_up_after_check(void) total_size = r1.size + r2_size; - memblock_reserve(r1.base, r1.size); + memblock_reserve_kern(r1.base, r1.size); allocated_ptr = run_memblock_alloc(r2_size, SMP_CACHE_BYTES); @@ -705,8 +705,8 @@ static int alloc_bottom_up_second_fit_check(void) total_size = r1.size + r2.size + r3_size; - memblock_reserve(r1.base, r1.size); - memblock_reserve(r2.base, r2.size); + memblock_reserve_kern(r1.base, r1.size); + memblock_reserve_kern(r2.base, r2.size); allocated_ptr = run_memblock_alloc(r3_size, SMP_CACHE_BYTES); diff --git a/tools/testing/memblock/tests/alloc_helpers_api.c b/tools/testing/memblock/tests/alloc_helpers_api.c index 3ef9486da8a09..e5362cfd2ff30 100644 --- a/tools/testing/memblock/tests/alloc_helpers_api.c +++ b/tools/testing/memblock/tests/alloc_helpers_api.c @@ -163,7 +163,7 @@ static int alloc_from_top_down_no_space_above_check(void) min_addr = memblock_end_of_DRAM() - SMP_CACHE_BYTES * 2; /* No space above this address */ - memblock_reserve(min_addr, r2_size); + memblock_reserve_kern(min_addr, r2_size); allocated_ptr = memblock_alloc_from(r1_size, SMP_CACHE_BYTES, min_addr); @@ -199,7 +199,7 @@ static int alloc_from_top_down_min_addr_cap_check(void) start_addr = (phys_addr_t)memblock_start_of_DRAM(); min_addr = start_addr - SMP_CACHE_BYTES * 3; - memblock_reserve(start_addr + r1_size, MEM_SIZE - r1_size); + memblock_reserve_kern(start_addr + r1_size, MEM_SIZE - r1_size); allocated_ptr = memblock_alloc_from(r1_size, SMP_CACHE_BYTES, min_addr); diff --git a/tools/testing/memblock/tests/alloc_nid_api.c b/tools/testing/memblock/tests/alloc_nid_api.c index 49bb416d34ffc..562e4701b0e02 100644 --- a/tools/testing/memblock/tests/alloc_nid_api.c +++ b/tools/testing/memblock/tests/alloc_nid_api.c @@ -324,7 +324,7 @@ static int alloc_nid_min_reserved_generic_check(void) min_addr = max_addr - r2_size; reserved_base = min_addr - r1_size; - memblock_reserve(reserved_base, r1_size); + memblock_reserve_kern(reserved_base, r1_size); allocated_ptr = run_memblock_alloc_nid(r2_size, SMP_CACHE_BYTES, min_addr, max_addr, @@ -374,7 +374,7 @@ static int alloc_nid_max_reserved_generic_check(void) max_addr = memblock_end_of_DRAM() - r1_size; min_addr = max_addr - r2_size; - memblock_reserve(max_addr, r1_size); + memblock_reserve_kern(max_addr, r1_size); allocated_ptr = run_memblock_alloc_nid(r2_size, SMP_CACHE_BYTES, min_addr, max_addr, @@ -436,8 +436,8 @@ static int alloc_nid_top_down_reserved_with_space_check(void) min_addr = r2.base + r2.size; max_addr = r1.base; - memblock_reserve(r1.base, r1.size); - memblock_reserve(r2.base, r2.size); + memblock_reserve_kern(r1.base, r1.size); + memblock_reserve_kern(r2.base, r2.size); allocated_ptr = run_memblock_alloc_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, @@ -499,8 +499,8 @@ static int alloc_nid_reserved_full_merge_generic_check(void) min_addr = r2.base + r2.size; max_addr = r1.base; - memblock_reserve(r1.base, r1.size); - memblock_reserve(r2.base, r2.size); + memblock_reserve_kern(r1.base, r1.size); + memblock_reserve_kern(r2.base, r2.size); allocated_ptr = run_memblock_alloc_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, @@ -563,8 +563,8 @@ static int alloc_nid_top_down_reserved_no_space_check(void) min_addr = r2.base + r2.size; max_addr = r1.base; - memblock_reserve(r1.base, r1.size); - memblock_reserve(r2.base, r2.size); + memblock_reserve_kern(r1.base, r1.size); + memblock_reserve_kern(r2.base, r2.size); allocated_ptr = run_memblock_alloc_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, @@ -909,8 +909,8 @@ static int alloc_nid_bottom_up_reserved_with_space_check(void) min_addr = r2.base + r2.size; max_addr = r1.base; - memblock_reserve(r1.base, r1.size); - memblock_reserve(r2.base, r2.size); + memblock_reserve_kern(r1.base, r1.size); + memblock_reserve_kern(r2.base, r2.size); allocated_ptr = run_memblock_alloc_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index bc30050227fda..2c0b383012533 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35475,15 +35475,65 @@ static void check_dfs_preorder(struct maple_tree *mt) } /* End of depth first search tests */ +/* get height of the lowest non-leaf node with free space */ +static unsigned char get_vacant_height(struct ma_wr_state *wr_mas, void *entry) +{ + struct ma_state *mas = wr_mas->mas; + char vacant_height = 0; + enum maple_type type; + unsigned long *pivots; + unsigned long min = 0; + unsigned long max = ULONG_MAX; + unsigned char offset; + + /* start traversal */ + mas_reset(mas); + mas_start(mas); + if (!xa_is_node(mas_root(mas))) + return 0; + + type = mte_node_type(mas->node); + wr_mas->type = type; + while (!ma_is_leaf(type)) { + mas_node_walk(mas, mte_to_node(mas->node), type, &min, &max); + offset = mas->offset; + mas->end = mas_data_end(mas); + pivots = ma_pivots(mte_to_node(mas->node), type); + + if (pivots) { + if (offset) + min = pivots[mas->offset - 1]; + if (offset < mas->end) + max = pivots[mas->offset]; + } + wr_mas->r_max = offset < mas->end ? pivots[offset] : mas->max; + + /* detect spanning write */ + if (mas_is_span_wr(wr_mas)) + break; + + if (mas->end < mt_slot_count(mas->node) - 1) + vacant_height = mas->depth + 1; + + mas_descend(mas); + type = mte_node_type(mas->node); + mas->depth++; + } + + return vacant_height; +} + /* Preallocation testing */ static noinline void __init check_prealloc(struct maple_tree *mt) { unsigned long i, max = 100; unsigned long allocated; unsigned char height; + unsigned char vacant_height; struct maple_node *mn; void *ptr = check_prealloc; MA_STATE(mas, mt, 10, 20); + MA_WR_STATE(wr_mas, &mas, ptr); mt_set_non_kernel(1000); for (i = 0; i <= max; i++) @@ -35494,8 +35544,9 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); + vacant_height = get_vacant_height(&wr_mas, ptr); MT_BUG_ON(mt, allocated == 0); - MT_BUG_ON(mt, allocated != 1 + height * 3); + MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); @@ -35503,8 +35554,9 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); + vacant_height = get_vacant_height(&wr_mas, ptr); MT_BUG_ON(mt, allocated == 0); - MT_BUG_ON(mt, allocated != 1 + height * 3); + MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3); MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); @@ -35514,7 +35566,8 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated != 1 + height * 3); + vacant_height = get_vacant_height(&wr_mas, ptr); + MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); mn->parent = ma_parent_ptr(mn); @@ -35527,7 +35580,8 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated != 1 + height * 3); + vacant_height = get_vacant_height(&wr_mas, ptr); + MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); @@ -35540,7 +35594,8 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated != 1 + height * 3); + vacant_height = get_vacant_height(&wr_mas, ptr); + MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); mas_push_node(&mas, mn); @@ -35553,7 +35608,8 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated != 1 + height * 3); + vacant_height = get_vacant_height(&wr_mas, ptr); + MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); @@ -35578,7 +35634,8 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated != 1 + height * 2); + vacant_height = get_vacant_height(&wr_mas, ptr); + MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 2); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mt_set_non_kernel(1); @@ -35595,8 +35652,14 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); + vacant_height = get_vacant_height(&wr_mas, ptr); MT_BUG_ON(mt, allocated == 0); - MT_BUG_ON(mt, allocated != 1 + height * 3); + /* + * vacant height cannot be used to compute the number of nodes needed + * as the root contains two entries which means it is on the verge of + * insufficiency. The worst case full height of the tree is needed. + */ + MT_BUG_ON(mt, allocated != height * 3 + 1); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mas_set_range(&mas, 0, 200); @@ -36248,6 +36311,45 @@ static noinline void __init check_mtree_dup(struct maple_tree *mt) extern void test_kmem_cache_bulk(void); +static inline void check_spanning_store_height(struct maple_tree *mt) +{ + int index = 0; + MA_STATE(mas, mt, 0, 0); + mas_lock(&mas); + while (mt_height(mt) != 3) { + mas_store_gfp(&mas, xa_mk_value(index), GFP_KERNEL); + mas_set(&mas, ++index); + } + mas_set_range(&mas, 90, 140); + mas_store_gfp(&mas, xa_mk_value(index), GFP_KERNEL); + MT_BUG_ON(mt, mas_mt_height(&mas) != 2); + mas_unlock(&mas); +} + +/* + * Test to check the path of a spanning rebalance which results in + * a collapse where the rebalancing of the child node leads to + * insufficieny in the parent node. + */ +static void check_collapsing_rebalance(struct maple_tree *mt) +{ + int i = 0; + MA_STATE(mas, mt, ULONG_MAX, ULONG_MAX); + + /* create a height 6 tree */ + while (mt_height(mt) < 6) { + mtree_store_range(mt, i, i + 10, xa_mk_value(i), GFP_KERNEL); + i += 9; + } + + /* delete all entries one at a time, starting from the right */ + do { + mas_erase(&mas); + } while (mas_prev(&mas, 0) != NULL); + + mtree_unlock(mt); +} + /* callback function used for check_nomem_writer_race() */ static void writer2(void *maple_tree) { @@ -36414,6 +36516,14 @@ void farmer_tests(void) check_spanning_write(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_spanning_store_height(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_collapsing_rebalance(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_null_expand(&tree); mtree_destroy(&tree); diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 66bb50356be08..03663222a0a55 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -811,6 +811,7 @@ $(OUTPUT)/bench_local_storage_create.o: $(OUTPUT)/bench_local_storage_create.ske $(OUTPUT)/bench_bpf_hashmap_lookup.o: $(OUTPUT)/bpf_hashmap_lookup.skel.h $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h $(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h +$(OUTPUT)/bench_sockmap.o: $(OUTPUT)/bench_sockmap_prog.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -831,6 +832,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_local_storage_create.o \ $(OUTPUT)/bench_htab_mem.o \ $(OUTPUT)/bench_bpf_crypto.o \ + $(OUTPUT)/bench_sockmap.o \ # $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 1bd403a5ef7b3..ddd73d06a1eb2 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -283,6 +283,7 @@ extern struct argp bench_local_storage_create_argp; extern struct argp bench_htab_mem_argp; extern struct argp bench_trigger_batch_argp; extern struct argp bench_crypto_argp; +extern struct argp bench_sockmap_argp; static const struct argp_child bench_parsers[] = { { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, @@ -297,6 +298,7 @@ static const struct argp_child bench_parsers[] = { { &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 }, { &bench_trigger_batch_argp, 0, "BPF triggering benchmark", 0 }, { &bench_crypto_argp, 0, "bpf crypto benchmark", 0 }, + { &bench_sockmap_argp, 0, "bpf sockmap benchmark", 0 }, {}, }; @@ -526,6 +528,12 @@ extern const struct bench bench_trig_uprobe_multi_push; extern const struct bench bench_trig_uretprobe_multi_push; extern const struct bench bench_trig_uprobe_multi_ret; extern const struct bench bench_trig_uretprobe_multi_ret; +#ifdef __x86_64__ +extern const struct bench bench_trig_uprobe_nop5; +extern const struct bench bench_trig_uretprobe_nop5; +extern const struct bench bench_trig_uprobe_multi_nop5; +extern const struct bench bench_trig_uretprobe_multi_nop5; +#endif extern const struct bench bench_rb_libbpf; extern const struct bench bench_rb_custom; @@ -549,6 +557,7 @@ extern const struct bench bench_local_storage_create; extern const struct bench bench_htab_mem; extern const struct bench bench_crypto_encrypt; extern const struct bench bench_crypto_decrypt; +extern const struct bench bench_sockmap; static const struct bench *benchs[] = { &bench_count_global, @@ -586,6 +595,12 @@ static const struct bench *benchs[] = { &bench_trig_uretprobe_multi_push, &bench_trig_uprobe_multi_ret, &bench_trig_uretprobe_multi_ret, +#ifdef __x86_64__ + &bench_trig_uprobe_nop5, + &bench_trig_uretprobe_nop5, + &bench_trig_uprobe_multi_nop5, + &bench_trig_uretprobe_multi_nop5, +#endif /* ringbuf/perfbuf benchmarks */ &bench_rb_libbpf, &bench_rb_custom, @@ -609,6 +624,7 @@ static const struct bench *benchs[] = { &bench_htab_mem, &bench_crypto_encrypt, &bench_crypto_decrypt, + &bench_sockmap, }; static void find_benchmark(void) diff --git a/tools/testing/selftests/bpf/benchs/bench_sockmap.c b/tools/testing/selftests/bpf/benchs/bench_sockmap.c new file mode 100644 index 0000000000000..8ebf563a67a2b --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_sockmap.c @@ -0,0 +1,598 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include "bench.h" +#include "bench_sockmap_prog.skel.h" + +#define FILE_SIZE (128 * 1024) +#define DATA_REPEAT_SIZE 10 + +static const char snd_data[DATA_REPEAT_SIZE] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + +/* c1 <-> [p1, p2] <-> c2 + * RX bench(BPF_SK_SKB_STREAM_VERDICT): + * ARG_FW_RX_PASS: + * send(p2) -> recv(c2) -> bpf skb passthrough -> recv(c2) + * ARG_FW_RX_VERDICT_EGRESS: + * send(c1) -> verdict skb to tx queuec of p2 -> recv(c2) + * ARG_FW_RX_VERDICT_INGRESS: + * send(c1) -> verdict skb to rx queuec of c2 -> recv(c2) + * + * TX bench(BPF_SK_MSG_VERDIC): + * ARG_FW_TX_PASS: + * send(p2) -> bpf msg passthrough -> send(p2) -> recv(c2) + * ARG_FW_TX_VERDICT_INGRESS: + * send(p2) -> verdict msg to rx queue of c2 -> recv(c2) + * ARG_FW_TX_VERDICT_EGRESS: + * send(p1) -> verdict msg to tx queue of p2 -> recv(c2) + */ +enum SOCKMAP_ARG_FLAG { + ARG_FW_RX_NORMAL = 11000, + ARG_FW_RX_PASS, + ARG_FW_RX_VERDICT_EGRESS, + ARG_FW_RX_VERDICT_INGRESS, + ARG_FW_TX_NORMAL, + ARG_FW_TX_PASS, + ARG_FW_TX_VERDICT_INGRESS, + ARG_FW_TX_VERDICT_EGRESS, + ARG_CTL_RX_STRP, + ARG_CONSUMER_DELAY_TIME, + ARG_PRODUCER_DURATION, +}; + +#define TXMODE_NORMAL() \ + ((ctx.mode) == ARG_FW_TX_NORMAL) + +#define TXMODE_BPF_INGRESS() \ + ((ctx.mode) == ARG_FW_TX_VERDICT_INGRESS) + +#define TXMODE_BPF_EGRESS() \ + ((ctx.mode) == ARG_FW_TX_VERDICT_EGRESS) + +#define TXMODE_BPF_PASS() \ + ((ctx.mode) == ARG_FW_TX_PASS) + +#define TXMODE_BPF() ( \ + TXMODE_BPF_PASS() || \ + TXMODE_BPF_INGRESS() || \ + TXMODE_BPF_EGRESS()) + +#define TXMODE() ( \ + TXMODE_NORMAL() || \ + TXMODE_BPF()) + +#define RXMODE_NORMAL() \ + ((ctx.mode) == ARG_FW_RX_NORMAL) + +#define RXMODE_BPF_PASS() \ + ((ctx.mode) == ARG_FW_RX_PASS) + +#define RXMODE_BPF_VERDICT_EGRESS() \ + ((ctx.mode) == ARG_FW_RX_VERDICT_EGRESS) + +#define RXMODE_BPF_VERDICT_INGRESS() \ + ((ctx.mode) == ARG_FW_RX_VERDICT_INGRESS) + +#define RXMODE_BPF_VERDICT() ( \ + RXMODE_BPF_VERDICT_INGRESS() || \ + RXMODE_BPF_VERDICT_EGRESS()) + +#define RXMODE_BPF() ( \ + RXMODE_BPF_PASS() || \ + RXMODE_BPF_VERDICT()) + +#define RXMODE() ( \ + RXMODE_NORMAL() || \ + RXMODE_BPF()) + +static struct socmap_ctx { + struct bench_sockmap_prog *skel; + enum SOCKMAP_ARG_FLAG mode; + #define c1 fds[0] + #define p1 fds[1] + #define c2 fds[2] + #define p2 fds[3] + #define sfd fds[4] + int fds[5]; + long send_calls; + long read_calls; + long prod_send; + long user_read; + int file_size; + int delay_consumer; + int prod_run_time; + int strp_size; +} ctx = { + .prod_send = 0, + .user_read = 0, + .file_size = FILE_SIZE, + .mode = ARG_FW_RX_VERDICT_EGRESS, + .fds = {0}, + .delay_consumer = 0, + .prod_run_time = 0, + .strp_size = 0, +}; + +static void bench_sockmap_prog_destroy(void) +{ + int i; + + for (i = 0; i < sizeof(ctx.fds); i++) { + if (ctx.fds[0] > 0) + close(ctx.fds[i]); + } + + bench_sockmap_prog__destroy(ctx.skel); +} + +static void init_addr(struct sockaddr_storage *ss, + socklen_t *len) +{ + struct sockaddr_in *addr4 = memset(ss, 0, sizeof(*ss)); + + addr4->sin_family = AF_INET; + addr4->sin_port = 0; + addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + *len = sizeof(*addr4); +} + +static bool set_non_block(int fd, bool blocking) +{ + int flags = fcntl(fd, F_GETFL, 0); + + if (flags == -1) + return false; + flags = blocking ? (flags | O_NONBLOCK) : (flags & ~O_NONBLOCK); + return (fcntl(fd, F_SETFL, flags) == 0); +} + +static int create_pair(int *c, int *p, int type) +{ + struct sockaddr_storage addr; + int err, cfd, pfd; + socklen_t addr_len = sizeof(struct sockaddr_storage); + + err = getsockname(ctx.sfd, (struct sockaddr *)&addr, &addr_len); + if (err) { + fprintf(stderr, "getsockname error %d\n", errno); + return err; + } + cfd = socket(AF_INET, type, 0); + if (cfd < 0) { + fprintf(stderr, "socket error %d\n", errno); + return err; + } + + err = connect(cfd, (struct sockaddr *)&addr, addr_len); + if (err && errno != EINPROGRESS) { + fprintf(stderr, "connect error %d\n", errno); + return err; + } + + pfd = accept(ctx.sfd, NULL, NULL); + if (pfd < 0) { + fprintf(stderr, "accept error %d\n", errno); + return err; + } + *c = cfd; + *p = pfd; + return 0; +} + +static int create_sockets(void) +{ + struct sockaddr_storage addr; + int err, one = 1; + socklen_t addr_len; + + init_addr(&addr, &addr_len); + ctx.sfd = socket(AF_INET, SOCK_STREAM, 0); + if (ctx.sfd < 0) { + fprintf(stderr, "socket error:%d\n", errno); + return ctx.sfd; + } + err = setsockopt(ctx.sfd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); + if (err) { + fprintf(stderr, "setsockopt error:%d\n", errno); + return err; + } + + err = bind(ctx.sfd, (struct sockaddr *)&addr, addr_len); + if (err) { + fprintf(stderr, "bind error:%d\n", errno); + return err; + } + + err = listen(ctx.sfd, SOMAXCONN); + if (err) { + fprintf(stderr, "listen error:%d\n", errno); + return err; + } + + err = create_pair(&ctx.c1, &ctx.p1, SOCK_STREAM); + if (err) { + fprintf(stderr, "create_pair 1 error\n"); + return err; + } + + err = create_pair(&ctx.c2, &ctx.p2, SOCK_STREAM); + if (err) { + fprintf(stderr, "create_pair 2 error\n"); + return err; + } + printf("create socket fd c1:%d p1:%d c2:%d p2:%d\n", + ctx.c1, ctx.p1, ctx.c2, ctx.p2); + return 0; +} + +static void validate(void) +{ + if (env.consumer_cnt != 2 || env.producer_cnt != 1 || + !env.affinity) + goto err; + return; +err: + fprintf(stderr, "argument '-c 2 -p 1 -a' is necessary"); + exit(1); +} + +static int setup_rx_sockmap(void) +{ + int verdict, pass, parser, map; + int zero = 0, one = 1; + int err; + + parser = bpf_program__fd(ctx.skel->progs.prog_skb_parser); + verdict = bpf_program__fd(ctx.skel->progs.prog_skb_verdict); + pass = bpf_program__fd(ctx.skel->progs.prog_skb_pass); + map = bpf_map__fd(ctx.skel->maps.sock_map_rx); + + if (ctx.strp_size != 0) { + ctx.skel->bss->pkt_size = ctx.strp_size; + err = bpf_prog_attach(parser, map, BPF_SK_SKB_STREAM_PARSER, 0); + if (err) + return err; + } + + if (RXMODE_BPF_VERDICT()) + err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); + else if (RXMODE_BPF_PASS()) + err = bpf_prog_attach(pass, map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (err) + return err; + + if (RXMODE_BPF_PASS()) + return bpf_map_update_elem(map, &zero, &ctx.c2, BPF_NOEXIST); + + err = bpf_map_update_elem(map, &zero, &ctx.p1, BPF_NOEXIST); + if (err < 0) + return err; + + if (RXMODE_BPF_VERDICT_INGRESS()) { + ctx.skel->bss->verdict_dir = BPF_F_INGRESS; + err = bpf_map_update_elem(map, &one, &ctx.c2, BPF_NOEXIST); + } else { + err = bpf_map_update_elem(map, &one, &ctx.p2, BPF_NOEXIST); + } + if (err < 0) + return err; + + return 0; +} + +static int setup_tx_sockmap(void) +{ + int zero = 0, one = 1; + int prog, map; + int err; + + map = bpf_map__fd(ctx.skel->maps.sock_map_tx); + prog = TXMODE_BPF_PASS() ? + bpf_program__fd(ctx.skel->progs.prog_skmsg_pass) : + bpf_program__fd(ctx.skel->progs.prog_skmsg_verdict); + + err = bpf_prog_attach(prog, map, BPF_SK_MSG_VERDICT, 0); + if (err) + return err; + + if (TXMODE_BPF_EGRESS()) { + err = bpf_map_update_elem(map, &zero, &ctx.p1, BPF_NOEXIST); + err |= bpf_map_update_elem(map, &one, &ctx.p2, BPF_NOEXIST); + } else { + ctx.skel->bss->verdict_dir = BPF_F_INGRESS; + err = bpf_map_update_elem(map, &zero, &ctx.p2, BPF_NOEXIST); + err |= bpf_map_update_elem(map, &one, &ctx.c2, BPF_NOEXIST); + } + + if (err < 0) + return err; + + return 0; +} + +static void setup(void) +{ + int err; + + ctx.skel = bench_sockmap_prog__open_and_load(); + if (!ctx.skel) { + fprintf(stderr, "error loading skel\n"); + exit(1); + } + + if (create_sockets()) { + fprintf(stderr, "create_net_mode error\n"); + goto err; + } + + if (RXMODE_BPF()) { + err = setup_rx_sockmap(); + if (err) { + fprintf(stderr, "setup_rx_sockmap error:%d\n", err); + goto err; + } + } else if (TXMODE_BPF()) { + err = setup_tx_sockmap(); + if (err) { + fprintf(stderr, "setup_tx_sockmap error:%d\n", err); + goto err; + } + } else { + fprintf(stderr, "unknown sockmap bench mode: %d\n", ctx.mode); + goto err; + } + + return; + +err: + bench_sockmap_prog_destroy(); + exit(1); +} + +static void measure(struct bench_res *res) +{ + res->drops = atomic_swap(&ctx.prod_send, 0); + res->hits = atomic_swap(&ctx.skel->bss->process_byte, 0); + res->false_hits = atomic_swap(&ctx.user_read, 0); + res->important_hits = atomic_swap(&ctx.send_calls, 0); + res->important_hits |= atomic_swap(&ctx.read_calls, 0) << 32; +} + +static void verify_data(int *check_pos, char *buf, int rcv) +{ + for (int i = 0 ; i < rcv; i++) { + if (buf[i] != snd_data[(*check_pos) % DATA_REPEAT_SIZE]) { + fprintf(stderr, "verify data fail"); + exit(1); + } + (*check_pos)++; + if (*check_pos >= FILE_SIZE) + *check_pos = 0; + } +} + +static void *consumer(void *input) +{ + int rcv, sent; + int check_pos = 0; + int tid = (long)input; + int recv_buf_size = FILE_SIZE; + char *buf = malloc(recv_buf_size); + int delay_read = ctx.delay_consumer; + + if (!buf) { + fprintf(stderr, "fail to init read buffer"); + return NULL; + } + + while (true) { + if (tid == 1) { + /* consumer 1 is unused for tx test and stream verdict test */ + if (RXMODE_BPF() || TXMODE()) + return NULL; + /* it's only for RX_NORMAL which service as reserve-proxy mode */ + rcv = read(ctx.p1, buf, recv_buf_size); + if (rcv < 0) { + fprintf(stderr, "fail to read p1"); + return NULL; + } + + sent = send(ctx.p2, buf, recv_buf_size, 0); + if (sent < 0) { + fprintf(stderr, "fail to send p2"); + return NULL; + } + } else { + if (delay_read != 0) { + if (delay_read < 0) + return NULL; + sleep(delay_read); + delay_read = 0; + } + /* read real endpoint by consumer 0 */ + atomic_inc(&ctx.read_calls); + rcv = read(ctx.c2, buf, recv_buf_size); + if (rcv < 0 && errno != EAGAIN) { + fprintf(stderr, "%s fail to read c2 %d\n", __func__, errno); + return NULL; + } + verify_data(&check_pos, buf, rcv); + atomic_add(&ctx.user_read, rcv); + } + } + + return NULL; +} + +static void *producer(void *input) +{ + int off = 0, fp, need_sent, sent; + int file_size = ctx.file_size; + struct timespec ts1, ts2; + int target; + FILE *file; + + file = tmpfile(); + if (!file) { + fprintf(stderr, "create file for sendfile"); + return NULL; + } + + /* we need simple verify */ + for (int i = 0; i < file_size; i++) { + if (fwrite(&snd_data[off], sizeof(char), 1, file) != 1) { + fprintf(stderr, "init tmpfile error"); + return NULL; + } + if (++off >= sizeof(snd_data)) + off = 0; + } + fflush(file); + fseek(file, 0, SEEK_SET); + + fp = fileno(file); + need_sent = file_size; + clock_gettime(CLOCK_MONOTONIC, &ts1); + + if (RXMODE_BPF_VERDICT()) + target = ctx.c1; + else if (TXMODE_BPF_EGRESS()) + target = ctx.p1; + else + target = ctx.p2; + set_non_block(target, true); + while (true) { + if (ctx.prod_run_time) { + clock_gettime(CLOCK_MONOTONIC, &ts2); + if (ts2.tv_sec - ts1.tv_sec > ctx.prod_run_time) + return NULL; + } + + errno = 0; + atomic_inc(&ctx.send_calls); + sent = sendfile(target, fp, NULL, need_sent); + if (sent < 0) { + if (errno != EAGAIN && errno != ENOMEM && errno != ENOBUFS) { + fprintf(stderr, "sendfile return %d, errorno %d:%s\n", + sent, errno, strerror(errno)); + return NULL; + } + continue; + } else if (sent < need_sent) { + need_sent -= sent; + atomic_add(&ctx.prod_send, sent); + continue; + } + atomic_add(&ctx.prod_send, need_sent); + need_sent = file_size; + lseek(fp, 0, SEEK_SET); + } + + return NULL; +} + +static void report_progress(int iter, struct bench_res *res, long delta_ns) +{ + double speed_mbs, prod_mbs, bpf_mbs, send_hz, read_hz; + + prod_mbs = res->drops / 1000000.0 / (delta_ns / 1000000000.0); + speed_mbs = res->false_hits / 1000000.0 / (delta_ns / 1000000000.0); + bpf_mbs = res->hits / 1000000.0 / (delta_ns / 1000000000.0); + send_hz = (res->important_hits & 0xFFFFFFFF) / (delta_ns / 1000000000.0); + read_hz = (res->important_hits >> 32) / (delta_ns / 1000000000.0); + + printf("Iter %3d (%7.3lfus): ", + iter, (delta_ns - 1000000000) / 1000.0); + printf("Send Speed %8.3lf MB/s (%8.3lf calls/s), BPF Speed %8.3lf MB/s, " + "Rcv Speed %8.3lf MB/s (%8.3lf calls/s)\n", + prod_mbs, send_hz, bpf_mbs, speed_mbs, read_hz); +} + +static void report_final(struct bench_res res[], int res_cnt) +{ + double verdict_mbs_mean = 0.0; + long verdict_total = 0; + int i; + + for (i = 0; i < res_cnt; i++) { + verdict_mbs_mean += res[i].hits / 1000000.0 / (0.0 + res_cnt); + verdict_total += res[i].hits / 1000000.0; + } + + printf("Summary: total trans %8.3lu MB \u00B1 %5.3lf MB/s\n", + verdict_total, verdict_mbs_mean); +} + +static const struct argp_option opts[] = { + { "rx-normal", ARG_FW_RX_NORMAL, NULL, 0, + "simple reserve-proxy mode, no bfp enabled"}, + { "rx-pass", ARG_FW_RX_PASS, NULL, 0, + "run bpf prog but no redir applied"}, + { "rx-strp", ARG_CTL_RX_STRP, "Byte", 0, + "enable strparser and set the encapsulation size"}, + { "rx-verdict-egress", ARG_FW_RX_VERDICT_EGRESS, NULL, 0, + "forward data with bpf(stream verdict)"}, + { "rx-verdict-ingress", ARG_FW_RX_VERDICT_INGRESS, NULL, 0, + "forward data with bpf(stream verdict)"}, + { "tx-normal", ARG_FW_TX_NORMAL, NULL, 0, + "simple c-s mode, no bfp enabled"}, + { "tx-pass", ARG_FW_TX_PASS, NULL, 0, + "run bpf prog but no redir applied"}, + { "tx-verdict-ingress", ARG_FW_TX_VERDICT_INGRESS, NULL, 0, + "forward msg to ingress queue of another socket"}, + { "tx-verdict-egress", ARG_FW_TX_VERDICT_EGRESS, NULL, 0, + "forward msg to egress queue of another socket"}, + { "delay-consumer", ARG_CONSUMER_DELAY_TIME, "SEC", 0, + "delay consumer start"}, + { "producer-duration", ARG_PRODUCER_DURATION, "SEC", 0, + "producer duration"}, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + switch (key) { + case ARG_FW_RX_NORMAL...ARG_FW_TX_VERDICT_EGRESS: + ctx.mode = key; + break; + case ARG_CONSUMER_DELAY_TIME: + ctx.delay_consumer = strtol(arg, NULL, 10); + break; + case ARG_PRODUCER_DURATION: + ctx.prod_run_time = strtol(arg, NULL, 10); + break; + case ARG_CTL_RX_STRP: + ctx.strp_size = strtol(arg, NULL, 10); + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +/* exported into benchmark runner */ +const struct argp bench_sockmap_argp = { + .options = opts, + .parser = parse_arg, +}; + +/* Benchmark performance of creating bpf local storage */ +const struct bench bench_sockmap = { + .name = "sockmap", + .argp = &bench_sockmap_argp, + .validate = validate, + .setup = setup, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = report_progress, + .report_final = report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 32e9f194d4497..82327657846e5 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -333,6 +333,20 @@ static void *uprobe_producer_ret(void *input) return NULL; } +#ifdef __x86_64__ +__nocf_check __weak void uprobe_target_nop5(void) +{ + asm volatile (".byte 0x0f, 0x1f, 0x44, 0x00, 0x00"); +} + +static void *uprobe_producer_nop5(void *input) +{ + while (true) + uprobe_target_nop5(); + return NULL; +} +#endif + static void usetup(bool use_retprobe, bool use_multi, void *target_addr) { size_t uprobe_offset; @@ -448,6 +462,28 @@ static void uretprobe_multi_ret_setup(void) usetup(true, true /* use_multi */, &uprobe_target_ret); } +#ifdef __x86_64__ +static void uprobe_nop5_setup(void) +{ + usetup(false, false /* !use_multi */, &uprobe_target_nop5); +} + +static void uretprobe_nop5_setup(void) +{ + usetup(true, false /* !use_multi */, &uprobe_target_nop5); +} + +static void uprobe_multi_nop5_setup(void) +{ + usetup(false, true /* use_multi */, &uprobe_target_nop5); +} + +static void uretprobe_multi_nop5_setup(void) +{ + usetup(true, true /* use_multi */, &uprobe_target_nop5); +} +#endif + const struct bench bench_trig_syscall_count = { .name = "trig-syscall-count", .validate = trigger_validate, @@ -506,3 +542,9 @@ BENCH_TRIG_USERMODE(uprobe_multi_ret, ret, "uprobe-multi-ret"); BENCH_TRIG_USERMODE(uretprobe_multi_nop, nop, "uretprobe-multi-nop"); BENCH_TRIG_USERMODE(uretprobe_multi_push, push, "uretprobe-multi-push"); BENCH_TRIG_USERMODE(uretprobe_multi_ret, ret, "uretprobe-multi-ret"); +#ifdef __x86_64__ +BENCH_TRIG_USERMODE(uprobe_nop5, nop5, "uprobe-nop5"); +BENCH_TRIG_USERMODE(uretprobe_nop5, nop5, "uretprobe-nop5"); +BENCH_TRIG_USERMODE(uprobe_multi_nop5, nop5, "uprobe-multi-nop5"); +BENCH_TRIG_USERMODE(uretprobe_multi_nop5, nop5, "uretprobe-multi-nop5"); +#endif diff --git a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh index af169f831f2f2..03f55405484b2 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh @@ -2,7 +2,7 @@ set -eufo pipefail -for i in usermode-count syscall-count {uprobe,uretprobe}-{nop,push,ret} +for i in usermode-count syscall-count {uprobe,uretprobe}-{nop,push,ret,nop5} do summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) printf "%-15s: %s\n" $i "$summary" diff --git a/tools/testing/selftests/bpf/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/bpf_arena_spin_lock.h index fb8dc07689998..4e29c31c4ef80 100644 --- a/tools/testing/selftests/bpf/bpf_arena_spin_lock.h +++ b/tools/testing/selftests/bpf/bpf_arena_spin_lock.h @@ -95,9 +95,6 @@ struct arena_qnode { #define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) #define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(!!(x), 0) - struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES]; static inline u32 encode_tail(int cpu, int idx) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c index dbd13f8e42a7a..dd6512fa652be 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c @@ -63,6 +63,12 @@ static void test_bpf_nf_ct(int mode) .repeat = 1, ); + if (SYS_NOFAIL("iptables-legacy --version")) { + fprintf(stdout, "Missing required iptables-legacy tool\n"); + test__skip(); + return; + } + skel = test_bpf_nf__open_and_load(); if (!ASSERT_OK_PTR(skel, "test_bpf_nf__open_and_load")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/fd_htab_lookup.c b/tools/testing/selftests/bpf/prog_tests/fd_htab_lookup.c new file mode 100644 index 0000000000000..ca46fdd6e1ae7 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/fd_htab_lookup.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025. Huawei Technologies Co., Ltd */ +#define _GNU_SOURCE +#include +#include +#include "fd_htab_lookup.skel.h" + +struct htab_op_ctx { + int fd; + int loop; + unsigned int entries; + bool stop; +}; + +#define ERR_TO_RETVAL(where, err) ((void *)(long)(((where) << 12) | (-err))) + +static void *htab_lookup_fn(void *arg) +{ + struct htab_op_ctx *ctx = arg; + int i = 0; + + while (i++ < ctx->loop && !ctx->stop) { + unsigned int j; + + for (j = 0; j < ctx->entries; j++) { + unsigned int key = j, zero = 0, value; + int inner_fd, err; + + err = bpf_map_lookup_elem(ctx->fd, &key, &value); + if (err) { + ctx->stop = true; + return ERR_TO_RETVAL(1, err); + } + + inner_fd = bpf_map_get_fd_by_id(value); + if (inner_fd < 0) { + /* The old map has been freed */ + if (inner_fd == -ENOENT) + continue; + ctx->stop = true; + return ERR_TO_RETVAL(2, inner_fd); + } + + err = bpf_map_lookup_elem(inner_fd, &zero, &value); + if (err) { + close(inner_fd); + ctx->stop = true; + return ERR_TO_RETVAL(3, err); + } + close(inner_fd); + + if (value != key) { + ctx->stop = true; + return ERR_TO_RETVAL(4, -EINVAL); + } + } + } + + return NULL; +} + +static void *htab_update_fn(void *arg) +{ + struct htab_op_ctx *ctx = arg; + int i = 0; + + while (i++ < ctx->loop && !ctx->stop) { + unsigned int j; + + for (j = 0; j < ctx->entries; j++) { + unsigned int key = j, zero = 0; + int inner_fd, err; + + inner_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, 4, 4, 1, NULL); + if (inner_fd < 0) { + ctx->stop = true; + return ERR_TO_RETVAL(1, inner_fd); + } + + err = bpf_map_update_elem(inner_fd, &zero, &key, 0); + if (err) { + close(inner_fd); + ctx->stop = true; + return ERR_TO_RETVAL(2, err); + } + + err = bpf_map_update_elem(ctx->fd, &key, &inner_fd, BPF_EXIST); + if (err) { + close(inner_fd); + ctx->stop = true; + return ERR_TO_RETVAL(3, err); + } + close(inner_fd); + } + } + + return NULL; +} + +static int setup_htab(int fd, unsigned int entries) +{ + unsigned int i; + + for (i = 0; i < entries; i++) { + unsigned int key = i, zero = 0; + int inner_fd, err; + + inner_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, 4, 4, 1, NULL); + if (!ASSERT_OK_FD(inner_fd, "new array")) + return -1; + + err = bpf_map_update_elem(inner_fd, &zero, &key, 0); + if (!ASSERT_OK(err, "init array")) { + close(inner_fd); + return -1; + } + + err = bpf_map_update_elem(fd, &key, &inner_fd, 0); + if (!ASSERT_OK(err, "init outer")) { + close(inner_fd); + return -1; + } + close(inner_fd); + } + + return 0; +} + +static int get_int_from_env(const char *name, int dft) +{ + const char *value; + + value = getenv(name); + if (!value) + return dft; + + return atoi(value); +} + +void test_fd_htab_lookup(void) +{ + unsigned int i, wr_nr = 8, rd_nr = 16; + pthread_t tids[wr_nr + rd_nr]; + struct fd_htab_lookup *skel; + struct htab_op_ctx ctx; + int err; + + skel = fd_htab_lookup__open_and_load(); + if (!ASSERT_OK_PTR(skel, "fd_htab_lookup__open_and_load")) + return; + + ctx.fd = bpf_map__fd(skel->maps.outer_map); + ctx.loop = get_int_from_env("FD_HTAB_LOOP_NR", 5); + ctx.stop = false; + ctx.entries = 8; + + err = setup_htab(ctx.fd, ctx.entries); + if (err) + goto destroy; + + memset(tids, 0, sizeof(tids)); + for (i = 0; i < wr_nr; i++) { + err = pthread_create(&tids[i], NULL, htab_update_fn, &ctx); + if (!ASSERT_OK(err, "pthread_create")) { + ctx.stop = true; + goto reap; + } + } + for (i = 0; i < rd_nr; i++) { + err = pthread_create(&tids[i + wr_nr], NULL, htab_lookup_fn, &ctx); + if (!ASSERT_OK(err, "pthread_create")) { + ctx.stop = true; + goto reap; + } + } + +reap: + for (i = 0; i < wr_nr + rd_nr; i++) { + void *ret = NULL; + char desc[32]; + + if (!tids[i]) + continue; + + snprintf(desc, sizeof(desc), "thread %u", i + 1); + err = pthread_join(tids[i], &ret); + ASSERT_OK(err, desc); + ASSERT_EQ(ret, NULL, desc); + } +destroy: + fd_htab_lookup__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c index 2d0796314862a..49b85c1c75521 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c @@ -3,13 +3,64 @@ /* * Tests for sockmap/sockhash holding kTLS sockets. */ - +#include #include +#include #include "test_progs.h" +#include "sockmap_helpers.h" +#include "test_skmsg_load_helpers.skel.h" +#include "test_sockmap_ktls.skel.h" #define MAX_TEST_NAME 80 #define TCP_ULP 31 +static int init_ktls_pairs(int c, int p) +{ + int err; + struct tls12_crypto_info_aes_gcm_128 crypto_rx; + struct tls12_crypto_info_aes_gcm_128 crypto_tx; + + err = setsockopt(c, IPPROTO_TCP, TCP_ULP, "tls", strlen("tls")); + if (!ASSERT_OK(err, "setsockopt(TCP_ULP)")) + goto out; + + err = setsockopt(p, IPPROTO_TCP, TCP_ULP, "tls", strlen("tls")); + if (!ASSERT_OK(err, "setsockopt(TCP_ULP)")) + goto out; + + memset(&crypto_rx, 0, sizeof(crypto_rx)); + memset(&crypto_tx, 0, sizeof(crypto_tx)); + crypto_rx.info.version = TLS_1_2_VERSION; + crypto_tx.info.version = TLS_1_2_VERSION; + crypto_rx.info.cipher_type = TLS_CIPHER_AES_GCM_128; + crypto_tx.info.cipher_type = TLS_CIPHER_AES_GCM_128; + + err = setsockopt(c, SOL_TLS, TLS_TX, &crypto_tx, sizeof(crypto_tx)); + if (!ASSERT_OK(err, "setsockopt(TLS_TX)")) + goto out; + + err = setsockopt(p, SOL_TLS, TLS_RX, &crypto_rx, sizeof(crypto_rx)); + if (!ASSERT_OK(err, "setsockopt(TLS_RX)")) + goto out; + return 0; +out: + return -1; +} + +static int create_ktls_pairs(int family, int sotype, int *c, int *p) +{ + int err; + + err = create_pair(family, sotype, c, p); + if (!ASSERT_OK(err, "create_pair()")) + return -1; + + err = init_ktls_pairs(*c, *p); + if (!ASSERT_OK(err, "init_ktls_pairs(c, p)")) + return -1; + return 0; +} + static int tcp_server(int family) { int err, s; @@ -146,6 +197,115 @@ static const char *fmt_test_name(const char *subtest_name, int family, return test_name; } +static void test_sockmap_ktls_offload(int family, int sotype) +{ + int err; + int c = 0, p = 0, sent, recvd; + char msg[12] = "hello world\0"; + char rcv[13]; + + err = create_ktls_pairs(family, sotype, &c, &p); + if (!ASSERT_OK(err, "create_ktls_pairs()")) + goto out; + + sent = send(c, msg, sizeof(msg), 0); + if (!ASSERT_OK(err, "send(msg)")) + goto out; + + recvd = recv(p, rcv, sizeof(rcv), 0); + if (!ASSERT_OK(err, "recv(msg)") || + !ASSERT_EQ(recvd, sent, "length mismatch")) + goto out; + + ASSERT_OK(memcmp(msg, rcv, sizeof(msg)), "data mismatch"); + +out: + if (c) + close(c); + if (p) + close(p); +} + +static void test_sockmap_ktls_tx_cork(int family, int sotype, bool push) +{ + int err, off; + int i, j; + int start_push = 0, push_len = 0; + int c = 0, p = 0, one = 1, sent, recvd; + int prog_fd, map_fd; + char msg[12] = "hello world\0"; + char rcv[20] = {0}; + struct test_sockmap_ktls *skel; + + skel = test_sockmap_ktls__open_and_load(); + if (!ASSERT_TRUE(skel, "open ktls skel")) + return; + + err = create_pair(family, sotype, &c, &p); + if (!ASSERT_OK(err, "create_pair()")) + goto out; + + prog_fd = bpf_program__fd(skel->progs.prog_sk_policy); + map_fd = bpf_map__fd(skel->maps.sock_map); + + err = bpf_prog_attach(prog_fd, map_fd, BPF_SK_MSG_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach sk msg")) + goto out; + + err = bpf_map_update_elem(map_fd, &one, &c, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem(c)")) + goto out; + + err = init_ktls_pairs(c, p); + if (!ASSERT_OK(err, "init_ktls_pairs(c, p)")) + goto out; + + skel->bss->cork_byte = sizeof(msg); + if (push) { + start_push = 1; + push_len = 2; + } + skel->bss->push_start = start_push; + skel->bss->push_end = push_len; + + off = sizeof(msg) / 2; + sent = send(c, msg, off, 0); + if (!ASSERT_EQ(sent, off, "send(msg)")) + goto out; + + recvd = recv_timeout(p, rcv, sizeof(rcv), MSG_DONTWAIT, 1); + if (!ASSERT_EQ(-1, recvd, "expected no data")) + goto out; + + /* send remaining msg */ + sent = send(c, msg + off, sizeof(msg) - off, 0); + if (!ASSERT_EQ(sent, sizeof(msg) - off, "send remaining data")) + goto out; + + recvd = recv_timeout(p, rcv, sizeof(rcv), MSG_DONTWAIT, 1); + if (!ASSERT_OK(err, "recv(msg)") || + !ASSERT_EQ(recvd, sizeof(msg) + push_len, "check length mismatch")) + goto out; + + for (i = 0, j = 0; i < recvd;) { + /* skip checking the data that has been pushed in */ + if (i >= start_push && i <= start_push + push_len - 1) { + i++; + continue; + } + if (!ASSERT_EQ(rcv[i], msg[j], "data mismatch")) + goto out; + i++; + j++; + } +out: + if (c) + close(c); + if (p) + close(p); + test_sockmap_ktls__destroy(skel); +} + static void run_tests(int family, enum bpf_map_type map_type) { int map; @@ -162,10 +322,22 @@ static void run_tests(int family, enum bpf_map_type map_type) close(map); } +static void run_ktls_test(int family, int sotype) +{ + if (test__start_subtest("tls simple offload")) + test_sockmap_ktls_offload(family, sotype); + if (test__start_subtest("tls tx cork")) + test_sockmap_ktls_tx_cork(family, sotype, false); + if (test__start_subtest("tls tx cork with push")) + test_sockmap_ktls_tx_cork(family, sotype, true); +} + void test_sockmap_ktls(void) { run_tests(AF_INET, BPF_MAP_TYPE_SOCKMAP); run_tests(AF_INET, BPF_MAP_TYPE_SOCKHASH); run_tests(AF_INET6, BPF_MAP_TYPE_SOCKMAP); run_tests(AF_INET6, BPF_MAP_TYPE_SOCKHASH); + run_ktls_test(AF_INET, SOCK_STREAM); + run_ktls_test(AF_INET6, SOCK_STREAM); } diff --git a/tools/testing/selftests/bpf/prog_tests/test_btf_ext.c b/tools/testing/selftests/bpf/prog_tests/test_btf_ext.c new file mode 100644 index 0000000000000..7d1b478c99a09 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_btf_ext.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms Inc. */ +#include +#include "test_btf_ext.skel.h" +#include "btf_helpers.h" + +static void subtest_line_func_info(void) +{ + struct test_btf_ext *skel; + struct bpf_prog_info info; + struct bpf_line_info line_info[128], *libbpf_line_info; + struct bpf_func_info func_info[128], *libbpf_func_info; + __u32 info_len = sizeof(info), libbbpf_line_info_cnt, libbbpf_func_info_cnt; + int err, fd; + + skel = test_btf_ext__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + fd = bpf_program__fd(skel->progs.global_func); + + memset(&info, 0, sizeof(info)); + info.line_info = ptr_to_u64(&line_info); + info.nr_line_info = sizeof(line_info); + info.line_info_rec_size = sizeof(*line_info); + err = bpf_prog_get_info_by_fd(fd, &info, &info_len); + if (!ASSERT_OK(err, "prog_line_info")) + goto out; + + libbpf_line_info = bpf_program__line_info(skel->progs.global_func); + libbbpf_line_info_cnt = bpf_program__line_info_cnt(skel->progs.global_func); + + memset(&info, 0, sizeof(info)); + info.func_info = ptr_to_u64(&func_info); + info.nr_func_info = sizeof(func_info); + info.func_info_rec_size = sizeof(*func_info); + err = bpf_prog_get_info_by_fd(fd, &info, &info_len); + if (!ASSERT_OK(err, "prog_func_info")) + goto out; + + libbpf_func_info = bpf_program__func_info(skel->progs.global_func); + libbbpf_func_info_cnt = bpf_program__func_info_cnt(skel->progs.global_func); + + if (!ASSERT_OK_PTR(libbpf_line_info, "bpf_program__line_info")) + goto out; + if (!ASSERT_EQ(libbbpf_line_info_cnt, info.nr_line_info, "line_info_cnt")) + goto out; + if (!ASSERT_OK_PTR(libbpf_func_info, "bpf_program__func_info")) + goto out; + if (!ASSERT_EQ(libbbpf_func_info_cnt, info.nr_func_info, "func_info_cnt")) + goto out; + ASSERT_MEMEQ(libbpf_line_info, line_info, libbbpf_line_info_cnt * sizeof(*line_info), + "line_info"); + ASSERT_MEMEQ(libbpf_func_info, func_info, libbbpf_func_info_cnt * sizeof(*func_info), + "func_info"); +out: + test_btf_ext__destroy(skel); +} + +void test_btf_ext(void) +{ + if (test__start_subtest("line_func_info")) + subtest_line_func_info(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_veristat.c b/tools/testing/selftests/bpf/prog_tests/test_veristat.c index a95b42bf744a2..47b56c258f3f4 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_veristat.c +++ b/tools/testing/selftests/bpf/prog_tests/test_veristat.c @@ -63,6 +63,9 @@ static void test_set_global_vars_succeeds(void) " -G \"var_eb = EB2\" "\ " -G \"var_ec = EC2\" "\ " -G \"var_b = 1\" "\ + " -G \"struct1.struct2.u.var_u8 = 170\" "\ + " -G \"union1.struct3.var_u8_l = 0xaa\" "\ + " -G \"union1.struct3.var_u8_h = 0xaa\" "\ "-vl2 > %s", fix->veristat, fix->tmpfile); read(fix->fd, fix->output, fix->sz); @@ -78,6 +81,8 @@ static void test_set_global_vars_succeeds(void) __CHECK_STR("_w=12 ", "var_eb = EB2"); __CHECK_STR("_w=13 ", "var_ec = EC2"); __CHECK_STR("_w=1 ", "var_b = 1"); + __CHECK_STR("_w=170 ", "struct1.struct2.u.var_u8 = 170"); + __CHECK_STR("_w=0xaaaa ", "union1.var_u16 = 0xaaaa"); out: teardown_fixture(fix); diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index c397336fe1ed1..5e7e61a332fce 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -14,16 +14,21 @@ #include #include "uprobe_syscall.skel.h" #include "uprobe_syscall_executed.skel.h" +#include "sdt.h" -__naked unsigned long uretprobe_regs_trigger(void) +#pragma GCC diagnostic ignored "-Wattributes" + +__attribute__((aligned(16))) +__nocf_check __weak __naked unsigned long uprobe_regs_trigger(void) { asm volatile ( - "movq $0xdeadbeef, %rax\n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "movq $0xdeadbeef, %rax \n" "ret\n" ); } -__naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) +__naked void uprobe_regs(struct pt_regs *before, struct pt_regs *after) { asm volatile ( "movq %r15, 0(%rdi)\n" @@ -44,15 +49,17 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) "movq $0, 120(%rdi)\n" /* orig_rax */ "movq $0, 128(%rdi)\n" /* rip */ "movq $0, 136(%rdi)\n" /* cs */ + "pushq %rax\n" "pushf\n" "pop %rax\n" "movq %rax, 144(%rdi)\n" /* eflags */ + "pop %rax\n" "movq %rsp, 152(%rdi)\n" /* rsp */ "movq $0, 160(%rdi)\n" /* ss */ /* save 2nd argument */ "pushq %rsi\n" - "call uretprobe_regs_trigger\n" + "call uprobe_regs_trigger\n" /* save return value and load 2nd argument pointer to rax */ "pushq %rax\n" @@ -92,25 +99,37 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) ); } -static void test_uretprobe_regs_equal(void) +static void test_uprobe_regs_equal(bool retprobe) { + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = retprobe, + ); struct uprobe_syscall *skel = NULL; struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; unsigned long *pa = (unsigned long *) &after; unsigned long *pp; + unsigned long offset; unsigned int i, cnt; - int err; + + offset = get_uprobe_offset(&uprobe_regs_trigger); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + return; skel = uprobe_syscall__open_and_load(); if (!ASSERT_OK_PTR(skel, "uprobe_syscall__open_and_load")) goto cleanup; - err = uprobe_syscall__attach(skel); - if (!ASSERT_OK(err, "uprobe_syscall__attach")) + skel->links.probe = bpf_program__attach_uprobe_opts(skel->progs.probe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(skel->links.probe, "bpf_program__attach_uprobe_opts")) goto cleanup; - uretprobe_regs(&before, &after); + /* make sure uprobe gets optimized */ + if (!retprobe) + uprobe_regs_trigger(); + + uprobe_regs(&before, &after); pp = (unsigned long *) &skel->bss->regs; cnt = sizeof(before)/sizeof(*pb); @@ -119,7 +138,7 @@ static void test_uretprobe_regs_equal(void) unsigned int offset = i * sizeof(unsigned long); /* - * Check register before and after uretprobe_regs_trigger call + * Check register before and after uprobe_regs_trigger call * that triggers the uretprobe. */ switch (offset) { @@ -133,7 +152,7 @@ static void test_uretprobe_regs_equal(void) /* * Check register seen from bpf program and register after - * uretprobe_regs_trigger call + * uprobe_regs_trigger call (with rax exception, check below). */ switch (offset) { /* @@ -146,6 +165,15 @@ static void test_uretprobe_regs_equal(void) case offsetof(struct pt_regs, rsp): case offsetof(struct pt_regs, ss): break; + /* + * uprobe does not see return value in rax, it needs to see the + * original (before) rax value + */ + case offsetof(struct pt_regs, rax): + if (!retprobe) { + ASSERT_EQ(pp[i], pb[i], "uprobe rax prog-before value check"); + break; + } default: if (!ASSERT_EQ(pp[i], pa[i], "register prog-after value check")) fprintf(stdout, "failed register offset %u\n", offset); @@ -175,7 +203,7 @@ static int write_bpf_testmod_uprobe(unsigned long offset) return ret != n ? (int) ret : 0; } -static void test_uretprobe_regs_change(void) +static void test_regs_change(void) { struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; @@ -183,13 +211,16 @@ static void test_uretprobe_regs_change(void) unsigned long cnt = sizeof(before)/sizeof(*pb); unsigned int i, err, offset; - offset = get_uprobe_offset(uretprobe_regs_trigger); + offset = get_uprobe_offset(uprobe_regs_trigger); err = write_bpf_testmod_uprobe(offset); if (!ASSERT_OK(err, "register_uprobe")) return; - uretprobe_regs(&before, &after); + /* make sure uprobe gets optimized */ + uprobe_regs_trigger(); + + uprobe_regs(&before, &after); err = write_bpf_testmod_uprobe(0); if (!ASSERT_OK(err, "unregister_uprobe")) @@ -277,10 +308,10 @@ static void test_uretprobe_syscall_call(void) _exit(0); } - skel->links.test = bpf_program__attach_uprobe_multi(skel->progs.test, pid, - "/proc/self/exe", - "uretprobe_syscall_call", &opts); - if (!ASSERT_OK_PTR(skel->links.test, "bpf_program__attach_uprobe_multi")) + skel->links.test_uretprobe_multi = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + pid, "/proc/self/exe", + "uretprobe_syscall_call", &opts); + if (!ASSERT_OK_PTR(skel->links.test_uretprobe_multi, "bpf_program__attach_uprobe_multi")) goto cleanup; /* kick the child */ @@ -301,6 +332,261 @@ static void test_uretprobe_syscall_call(void) close(go[0]); } +#define TRAMP "[uprobes-trampoline]" + +__attribute__((aligned(16))) +__nocf_check __weak __naked void uprobe_test(void) +{ + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); +} + +__attribute__((aligned(16))) +__nocf_check __weak void usdt_test(void) +{ + STAP_PROBE(optimized_uprobe, usdt); +} + +static int find_uprobes_trampoline(void **start, void **end) +{ + char line[128]; + int ret = -1; + FILE *maps; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) { + fprintf(stderr, "cannot open maps\n"); + return -1; + } + + while (fgets(line, sizeof(line), maps)) { + int m = -1; + + /* We care only about private r-x mappings. */ + if (sscanf(line, "%p-%p r-xp %*x %*x:%*x %*u %n", start, end, &m) != 2) + continue; + if (m < 0) + continue; + if (!strncmp(&line[m], TRAMP, sizeof(TRAMP)-1)) { + ret = 0; + break; + } + } + + fclose(maps); + return ret; +} + +static unsigned char nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; + +static void *find_nop5(void *fn) +{ + int i; + + for (i = 0; i < 10; i++) { + if (!memcmp(nop5, fn + i, 5)) + return fn + i; + } + return NULL; +} + +typedef void (__attribute__((nocf_check)) *trigger_t)(void); + +static bool shstk_is_enabled; + +static void check_attach(struct uprobe_syscall_executed *skel, trigger_t trigger, + void *addr, int executed) +{ + void *tramp_start, *tramp_end; + struct __arch_relative_insn { + u8 op; + s32 raddr; + } __packed *call; + s32 delta; + u8 *bp; + + /* Uprobe gets optimized after first trigger, so let's press twice. */ + trigger(); + trigger(); + + if (!shstk_is_enabled && + !ASSERT_OK(find_uprobes_trampoline(&tramp_start, &tramp_end), "uprobes_trampoline")) + return; + + /* Make sure bpf program got executed.. */ + ASSERT_EQ(skel->bss->executed, executed, "executed"); + + if (shstk_is_enabled) { + /* .. and check optimization is disabled under shadow stack. */ + bp = (u8 *) addr; + ASSERT_EQ(*bp, 0xcc, "int3"); + } else { + /* .. and check the trampoline is as expected. */ + call = (struct __arch_relative_insn *) addr; + delta = (unsigned long) tramp_start - ((unsigned long) addr + 5); + + ASSERT_EQ(call->op, 0xe8, "call"); + ASSERT_EQ(call->raddr, delta, "delta"); + ASSERT_EQ(tramp_end - tramp_start, 4096, "size"); + } +} + +static void check_detach(struct uprobe_syscall_executed *skel, trigger_t trigger, void *addr) +{ + void *tramp_start, *tramp_end; + + /* [uprobes_trampoline] stays after detach */ + ASSERT_OK(find_uprobes_trampoline(&tramp_start, &tramp_end), "uprobes_trampoline"); + ASSERT_OK(memcmp(addr, nop5, 5), "nop5"); +} + +static void check(struct uprobe_syscall_executed *skel, struct bpf_link *link, + trigger_t trigger, void *addr, int executed) +{ + check_attach(skel, trigger, addr, executed); + bpf_link__destroy(link); + check_detach(skel, trigger, addr); +} + +static void test_uprobe_legacy(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = true, + ); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + /* uprobe */ + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + + /* uretprobe */ + skel->bss->executed = 0; + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uretprobe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_multi(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + opts.offsets = &offset; + opts.cnt = 1; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + /* uprobe.multi */ + link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + + /* uretprobe.multi */ + skel->bss->executed = 0; + opts.retprobe = true; + link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_session(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, + .session = true, + ); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + opts.offsets = &offset; + opts.cnt = 1; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_session, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 4); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_usdt(void) +{ + struct uprobe_syscall_executed *skel; + struct bpf_link *link; + void *addr; + + errno = 0; + addr = find_nop5(usdt_test); + if (!ASSERT_OK_PTR(addr, "find_nop5")) + return; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + link = bpf_program__attach_usdt(skel->progs.test_usdt, + -1 /* all PIDs */, "/proc/self/exe", + "optimized_uprobe", "usdt", NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_usdt")) + goto cleanup; + + check(skel, link, usdt_test, addr, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + /* * Borrowed from tools/testing/selftests/x86/test_shadow_stack.c. * @@ -343,43 +629,164 @@ static void test_uretprobe_shadow_stack(void) return; } - /* Run all of the uretprobe tests. */ - test_uretprobe_regs_equal(); - test_uretprobe_regs_change(); + /* Run all the tests with shadow stack in place. */ + shstk_is_enabled = true; + + test_uprobe_regs_equal(false); + test_uprobe_regs_equal(true); test_uretprobe_syscall_call(); + test_uprobe_legacy(); + test_uprobe_multi(); + test_uprobe_session(); + test_uprobe_usdt(); + + test_regs_change(); + + shstk_is_enabled = false; + ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); } -#else -static void test_uretprobe_regs_equal(void) + +static volatile bool race_stop; + +static void *worker_trigger(void *arg) { - test__skip(); + unsigned long rounds = 0; + + while (!race_stop) { + uprobe_test(); + rounds++; + } + + printf("tid %d trigger rounds: %lu\n", gettid(), rounds); + return NULL; } -static void test_uretprobe_regs_change(void) +static void *worker_attach(void *arg) { - test__skip(); + struct uprobe_syscall_executed *skel; + unsigned long rounds = 0, offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + return NULL; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return NULL; + + while (!race_stop) { + skel->links.test_uprobe = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, NULL); + if (!ASSERT_OK_PTR(skel->links.test_uprobe, "bpf_program__attach_uprobe_opts")) + break; + + bpf_link__destroy(skel->links.test_uprobe); + skel->links.test_uprobe = NULL; + rounds++; + } + + printf("tid %d attach rounds: %lu hits: %d\n", gettid(), rounds, skel->bss->executed); + uprobe_syscall_executed__destroy(skel); + return NULL; } -static void test_uretprobe_syscall_call(void) +static void test_uprobe_race(void) { - test__skip(); + int err, i, nr_threads; + pthread_t *threads; + + nr_threads = libbpf_num_possible_cpus(); + if (!ASSERT_GE(nr_threads, 0, "libbpf_num_possible_cpus")) + return; + + threads = malloc(sizeof(*threads) * nr_threads); + if (!ASSERT_OK_PTR(threads, "malloc")) + return; + + for (i = 0; i < nr_threads; i++) { + err = pthread_create(&threads[i], NULL, i % 2 ? worker_trigger : worker_attach, + NULL); + if (!ASSERT_OK(err, "pthread_create")) + goto cleanup; + } + + sleep(4); + +cleanup: + race_stop = true; + for (nr_threads = i, i = 0; i < nr_threads; i++) + pthread_join(threads[i], NULL); } -static void test_uretprobe_shadow_stack(void) +#ifndef __NR_uprobe +#define __NR_uprobe 336 +#endif + +static void test_uprobe_sigill(void) { - test__skip(); + int status, err, pid; + + pid = fork(); + if (!ASSERT_GE(pid, 0, "fork")) + return; + /* child */ + if (pid == 0) { + asm volatile ( + "pushq %rax\n" + "pushq %rcx\n" + "pushq %r11\n" + "movq $" __stringify(__NR_uprobe) ", %rax\n" + "syscall\n" + "popq %r11\n" + "popq %rcx\n" + "retq\n" + ); + exit(0); + } + + err = waitpid(pid, &status, 0); + ASSERT_EQ(err, pid, "waitpid"); + + /* verify the child got killed with SIGILL */ + ASSERT_EQ(WIFSIGNALED(status), 1, "WIFSIGNALED"); + ASSERT_EQ(WTERMSIG(status), SIGILL, "WTERMSIG"); } -#endif -void test_uprobe_syscall(void) +static void __test_uprobe_syscall(void) { if (test__start_subtest("uretprobe_regs_equal")) - test_uretprobe_regs_equal(); - if (test__start_subtest("uretprobe_regs_change")) - test_uretprobe_regs_change(); + test_uprobe_regs_equal(true); if (test__start_subtest("uretprobe_syscall_call")) test_uretprobe_syscall_call(); if (test__start_subtest("uretprobe_shadow_stack")) test_uretprobe_shadow_stack(); + if (test__start_subtest("uprobe_legacy")) + test_uprobe_legacy(); + if (test__start_subtest("uprobe_multi")) + test_uprobe_multi(); + if (test__start_subtest("uprobe_session")) + test_uprobe_session(); + if (test__start_subtest("uprobe_usdt")) + test_uprobe_usdt(); + if (test__start_subtest("uprobe_race")) + test_uprobe_race(); + if (test__start_subtest("uprobe_sigill")) + test_uprobe_sigill(); + if (test__start_subtest("uprobe_regs_equal")) + test_uprobe_regs_equal(false); + if (test__start_subtest("regs_change")) + test_regs_change(); +} +#else +static void __test_uprobe_syscall(void) +{ + test__skip(); +} +#endif + +void test_uprobe_syscall(void) +{ + __test_uprobe_syscall(); } diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c index 495d66414b57e..3a5b5230bfa07 100644 --- a/tools/testing/selftests/bpf/prog_tests/usdt.c +++ b/tools/testing/selftests/bpf/prog_tests/usdt.c @@ -40,12 +40,19 @@ static void __always_inline trigger_func(int x) { } } -static void subtest_basic_usdt(void) +static void subtest_basic_usdt(bool optimized) { LIBBPF_OPTS(bpf_usdt_opts, opts); struct test_usdt *skel; struct test_usdt__bss *bss; - int err, i; + int err, i, called; + +#define TRIGGER(x) ({ \ + trigger_func(x); \ + if (optimized) \ + trigger_func(x); \ + optimized ? 2 : 1; \ + }) skel = test_usdt__open_and_load(); if (!ASSERT_OK_PTR(skel, "skel_open")) @@ -66,11 +73,11 @@ static void subtest_basic_usdt(void) if (!ASSERT_OK_PTR(skel->links.usdt0, "usdt0_link")) goto cleanup; - trigger_func(1); + called = TRIGGER(1); - ASSERT_EQ(bss->usdt0_called, 1, "usdt0_called"); - ASSERT_EQ(bss->usdt3_called, 1, "usdt3_called"); - ASSERT_EQ(bss->usdt12_called, 1, "usdt12_called"); + ASSERT_EQ(bss->usdt0_called, called, "usdt0_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); + ASSERT_EQ(bss->usdt12_called, called, "usdt12_called"); ASSERT_EQ(bss->usdt0_cookie, 0xcafedeadbeeffeed, "usdt0_cookie"); ASSERT_EQ(bss->usdt0_arg_cnt, 0, "usdt0_arg_cnt"); @@ -119,11 +126,11 @@ static void subtest_basic_usdt(void) * bpf_program__attach_usdt() handles this properly and attaches to * all possible places of USDT invocation. */ - trigger_func(2); + called += TRIGGER(2); - ASSERT_EQ(bss->usdt0_called, 2, "usdt0_called"); - ASSERT_EQ(bss->usdt3_called, 2, "usdt3_called"); - ASSERT_EQ(bss->usdt12_called, 2, "usdt12_called"); + ASSERT_EQ(bss->usdt0_called, called, "usdt0_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); + ASSERT_EQ(bss->usdt12_called, called, "usdt12_called"); /* only check values that depend on trigger_func()'s input value */ ASSERT_EQ(bss->usdt3_args[0], 2, "usdt3_arg1"); @@ -142,9 +149,9 @@ static void subtest_basic_usdt(void) if (!ASSERT_OK_PTR(skel->links.usdt3, "usdt3_reattach")) goto cleanup; - trigger_func(3); + called += TRIGGER(3); - ASSERT_EQ(bss->usdt3_called, 3, "usdt3_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); /* this time usdt3 has custom cookie */ ASSERT_EQ(bss->usdt3_cookie, 0xBADC00C51E, "usdt3_cookie"); ASSERT_EQ(bss->usdt3_arg_cnt, 3, "usdt3_arg_cnt"); @@ -158,6 +165,7 @@ static void subtest_basic_usdt(void) cleanup: test_usdt__destroy(skel); +#undef TRIGGER } unsigned short test_usdt_100_semaphore SEC(".probes"); @@ -419,7 +427,11 @@ static void subtest_urandom_usdt(bool auto_attach) void test_usdt(void) { if (test__start_subtest("basic")) - subtest_basic_usdt(); + subtest_basic_usdt(false); +#ifdef __x86_64__ + if (test__start_subtest("basic_optimized")) + subtest_basic_usdt(true); +#endif if (test__start_subtest("multispec")) subtest_multispec_usdt(); if (test__start_subtest("urand_auto_attach")) diff --git a/tools/testing/selftests/bpf/progs/bench_sockmap_prog.c b/tools/testing/selftests/bpf/progs/bench_sockmap_prog.c new file mode 100644 index 0000000000000..079bf3794b3a7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bench_sockmap_prog.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +long process_byte = 0; +int verdict_dir = 0; +int dropped = 0; +int pkt_size = 0; +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 20); + __type(key, int); + __type(value, int); +} sock_map_rx SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 20); + __type(key, int); + __type(value, int); +} sock_map_tx SEC(".maps"); + +SEC("sk_skb/stream_parser") +int prog_skb_parser(struct __sk_buff *skb) +{ + return pkt_size; +} + +SEC("sk_skb/stream_verdict") +int prog_skb_verdict(struct __sk_buff *skb) +{ + int one = 1; + int ret = bpf_sk_redirect_map(skb, &sock_map_rx, one, verdict_dir); + + if (ret == SK_DROP) + dropped++; + __sync_fetch_and_add(&process_byte, skb->len); + return ret; +} + +SEC("sk_skb/stream_verdict") +int prog_skb_pass(struct __sk_buff *skb) +{ + __sync_fetch_and_add(&process_byte, skb->len); + return SK_PASS; +} + +SEC("sk_msg") +int prog_skmsg_verdict(struct sk_msg_md *msg) +{ + int one = 1; + + __sync_fetch_and_add(&process_byte, msg->size); + return bpf_msg_redirect_map(msg, &sock_map_tx, one, verdict_dir); +} + +SEC("sk_msg") +int prog_skmsg_pass(struct sk_msg_md *msg) +{ + __sync_fetch_and_add(&process_byte, msg->size); + return SK_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/fd_htab_lookup.c b/tools/testing/selftests/bpf/progs/fd_htab_lookup.c new file mode 100644 index 0000000000000..a4a9e1db626fc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/fd_htab_lookup.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025. Huawei Technologies Co., Ltd */ +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct inner_map_type { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(key_size, 4); + __uint(value_size, 4); + __uint(max_entries, 1); +} inner_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 64); + __type(key, int); + __type(value, int); + __array(values, struct inner_map_type); +} outer_map SEC(".maps") = { + .values = { + [0] = &inner_map, + }, +}; diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 427b72954b87e..76adf4a8f2dad 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -7,8 +7,6 @@ #include "bpf_misc.h" #include "bpf_compiler.h" -#define unlikely(x) __builtin_expect(!!(x), 0) - static volatile int zero = 0; int my_pid; diff --git a/tools/testing/selftests/bpf/progs/prepare.c b/tools/testing/selftests/bpf/progs/prepare.c index 1f1dd547e4ee2..cfc1f48e0d28d 100644 --- a/tools/testing/selftests/bpf/progs/prepare.c +++ b/tools/testing/selftests/bpf/progs/prepare.c @@ -2,7 +2,6 @@ /* Copyright (c) 2025 Meta */ #include #include -//#include char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/set_global_vars.c b/tools/testing/selftests/bpf/progs/set_global_vars.c index 9adb5ba4cd4d7..90f5656c39911 100644 --- a/tools/testing/selftests/bpf/progs/set_global_vars.c +++ b/tools/testing/selftests/bpf/progs/set_global_vars.c @@ -24,6 +24,44 @@ const volatile enum Enumu64 var_eb = EB1; const volatile enum Enums64 var_ec = EC1; const volatile bool var_b = false; +struct Struct { + int:16; + __u16 filler; + struct { + const __u16 filler2; + }; + struct Struct2 { + __u16 filler; + volatile struct { + const int:1; + union { + const volatile __u8 var_u8; + const volatile __s16 filler3; + const int:1; + } u; + }; + } struct2; +}; + +const volatile __u32 stru = 0; /* same prefix as below */ +const volatile struct Struct struct1 = {.struct2 = {.u = {.var_u8 = 1}}}; + +union Union { + __u16 var_u16; + struct Struct3 { + struct { + __u8 var_u8_l; + }; + struct { + struct { + __u8 var_u8_h; + }; + }; + } struct3; +}; + +const volatile union Union union1 = {.var_u16 = -1}; + char arr[4] = {0}; SEC("socket") @@ -43,5 +81,8 @@ int test_set_globals(void *ctx) a = var_eb; a = var_ec; a = var_b; + a = struct1.struct2.u.var_u8; + a = union1.var_u16; + return a; } diff --git a/tools/testing/selftests/bpf/progs/test_btf_ext.c b/tools/testing/selftests/bpf/progs/test_btf_ext.c new file mode 100644 index 0000000000000..cdf20331db045 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_btf_ext.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Meta Platforms Inc. */ + +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +__noinline static void f0(void) +{ + __u64 a = 1; + + __sink(a); +} + +SEC("xdp") +__u64 global_func(struct xdp_md *xdp) +{ + f0(); + return XDP_DROP; +} diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_ktls.c b/tools/testing/selftests/bpf/progs/test_sockmap_ktls.c new file mode 100644 index 0000000000000..e0f757929ef4c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sockmap_ktls.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +int cork_byte; +int push_start; +int push_end; + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 20); + __type(key, int); + __type(value, int); +} sock_map SEC(".maps"); + +SEC("sk_msg") +int prog_sk_policy(struct sk_msg_md *msg) +{ + if (cork_byte > 0) + bpf_msg_cork_bytes(msg, cork_byte); + if (push_start > 0 && push_end > 0) + bpf_msg_push_data(msg, push_start, push_end, 0); + + return SK_PASS; +} diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall.c b/tools/testing/selftests/bpf/progs/uprobe_syscall.c index 8a4fa6c7ef590..e08c31669e5a7 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall.c @@ -7,8 +7,8 @@ struct pt_regs regs; char _license[] SEC("license") = "GPL"; -SEC("uretprobe//proc/self/exe:uretprobe_regs_trigger") -int uretprobe(struct pt_regs *ctx) +SEC("uprobe") +int probe(struct pt_regs *ctx) { __builtin_memcpy(®s, ctx, sizeof(regs)); return 0; diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c index 0d7f1a7db2e2e..7bb4338c3ee2e 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "vmlinux.h" #include +#include +#include #include struct pt_regs regs; @@ -9,9 +11,44 @@ char _license[] SEC("license") = "GPL"; int executed = 0; +SEC("uprobe") +int BPF_UPROBE(test_uprobe) +{ + executed++; + return 0; +} + +SEC("uretprobe") +int BPF_URETPROBE(test_uretprobe) +{ + executed++; + return 0; +} + +SEC("uprobe.multi") +int test_uprobe_multi(struct pt_regs *ctx) +{ + executed++; + return 0; +} + SEC("uretprobe.multi") -int test(struct pt_regs *regs) +int test_uretprobe_multi(struct pt_regs *ctx) +{ + executed++; + return 0; +} + +SEC("uprobe.session") +int test_uprobe_session(struct pt_regs *ctx) +{ + executed++; + return 0; +} + +SEC("usdt") +int test_usdt(struct pt_regs *ctx) { - executed = 1; + executed++; return 0; } diff --git a/tools/testing/selftests/bpf/sdt.h b/tools/testing/selftests/bpf/sdt.h index 1fcfa5160231d..1d62c06f5ddcb 100644 --- a/tools/testing/selftests/bpf/sdt.h +++ b/tools/testing/selftests/bpf/sdt.h @@ -236,6 +236,13 @@ __extension__ extern unsigned long long __sdt_unsp; #define _SDT_NOP nop #endif +/* Use 5 byte nop for x86_64 to allow optimizing uprobes. */ +#if defined(__x86_64__) +# define _SDT_DEF_NOP _SDT_ASM_5(990: .byte 0x0f, 0x1f, 0x44, 0x00, 0x00) +#else +# define _SDT_DEF_NOP _SDT_ASM_1(990: _SDT_NOP) +#endif + #define _SDT_NOTE_NAME "stapsdt" #define _SDT_NOTE_TYPE 3 @@ -288,7 +295,7 @@ __extension__ extern unsigned long long __sdt_unsp; #define _SDT_ASM_BODY(provider, name, pack_args, args, ...) \ _SDT_DEF_MACROS \ - _SDT_ASM_1(990: _SDT_NOP) \ + _SDT_DEF_NOP \ _SDT_ASM_3( .pushsection .note.stapsdt,_SDT_ASM_AUTOGROUP,"note") \ _SDT_ASM_1( .balign 4) \ _SDT_ASM_3( .4byte 992f-991f, 994f-993f, _SDT_NOTE_TYPE) \ diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 3220f1d28697e..5a3dc463ace59 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -496,15 +496,21 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { */ #ifdef __x86_64__ +static int +uprobe_handler(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data) +{ + regs->cx = 0x87654321feebdaed; + return 0; +} + static int uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, struct pt_regs *regs, __u64 *data) { regs->ax = 0x12345678deadbeef; - regs->cx = 0x87654321feebdaed; regs->r11 = (u64) -1; - return true; + return 0; } struct testmod_uprobe { @@ -516,6 +522,7 @@ struct testmod_uprobe { static DEFINE_MUTEX(testmod_uprobe_mutex); static struct testmod_uprobe uprobe = { + .consumer.handler = uprobe_handler, .consumer.ret_handler = uprobe_ret_handler, }; @@ -1340,7 +1347,7 @@ static int st_ops_gen_prologue_with_kfunc(struct bpf_insn *insn_buf, bool direct *insn++ = BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_7, offsetof(struct st_ops_args, a)); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 2); *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_0); - *insn++ = BPF_CALL_KFUNC(0, bpf_cgroup_release_id), + *insn++ = BPF_CALL_KFUNC(0, bpf_cgroup_release_id); *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_8); *insn++ = prog->insnsi[0]; @@ -1379,7 +1386,7 @@ static int st_ops_gen_epilogue_with_kfunc(struct bpf_insn *insn_buf, const struc *insn++ = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, offsetof(struct st_ops_args, a)); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 2); *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_0); - *insn++ = BPF_CALL_KFUNC(0, bpf_cgroup_release_id), + *insn++ = BPF_CALL_KFUNC(0, bpf_cgroup_release_id); *insn++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_6); *insn++ = BPF_ALU64_IMM(BPF_MUL, BPF_REG_0, 2); *insn++ = BPF_EXIT_INSN(); diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index a18972ffdeb6f..b2bb20b009524 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -1486,7 +1486,84 @@ static bool is_preset_supported(const struct btf_type *t) return btf_is_int(t) || btf_is_enum(t) || btf_is_enum64(t); } -static int set_global_var(struct bpf_object *obj, struct btf *btf, const struct btf_type *t, +const int btf_find_member(const struct btf *btf, + const struct btf_type *parent_type, + __u32 parent_offset, + const char *member_name, + int *member_tid, + __u32 *member_offset) +{ + int i; + + if (!btf_is_composite(parent_type)) + return -EINVAL; + + for (i = 0; i < btf_vlen(parent_type); ++i) { + const struct btf_member *member; + const struct btf_type *member_type; + int tid; + + member = btf_members(parent_type) + i; + tid = btf__resolve_type(btf, member->type); + if (tid < 0) + return -EINVAL; + + member_type = btf__type_by_id(btf, tid); + if (member->name_off) { + const char *name = btf__name_by_offset(btf, member->name_off); + + if (strcmp(member_name, name) == 0) { + if (btf_member_bitfield_size(parent_type, i) != 0) { + fprintf(stderr, "Bitfield presets are not supported %s\n", + name); + return -EINVAL; + } + *member_offset = parent_offset + member->offset; + *member_tid = tid; + return 0; + } + } else if (btf_is_composite(member_type)) { + int err; + + err = btf_find_member(btf, member_type, parent_offset + member->offset, + member_name, member_tid, member_offset); + if (!err) + return 0; + } + } + + return -EINVAL; +} + +static int adjust_var_secinfo(struct btf *btf, const struct btf_type *t, + struct btf_var_secinfo *sinfo, const char *var) +{ + char expr[256], *saveptr; + const struct btf_type *base_type, *member_type; + int err, member_tid; + char *name; + __u32 member_offset = 0; + + base_type = btf__type_by_id(btf, btf__resolve_type(btf, t->type)); + snprintf(expr, sizeof(expr), "%s", var); + strtok_r(expr, ".", &saveptr); + + while ((name = strtok_r(NULL, ".", &saveptr))) { + err = btf_find_member(btf, base_type, 0, name, &member_tid, &member_offset); + if (err) { + fprintf(stderr, "Could not find member %s for variable %s\n", name, var); + return err; + } + member_type = btf__type_by_id(btf, member_tid); + sinfo->offset += member_offset / 8; + sinfo->size = member_type->size; + sinfo->type = member_tid; + base_type = member_type; + } + return 0; +} + +static int set_global_var(struct bpf_object *obj, struct btf *btf, struct bpf_map *map, struct btf_var_secinfo *sinfo, struct var_preset *preset) { @@ -1495,9 +1572,9 @@ static int set_global_var(struct bpf_object *obj, struct btf *btf, const struct long long value = preset->ivalue; size_t size; - base_type = btf__type_by_id(btf, btf__resolve_type(btf, t->type)); + base_type = btf__type_by_id(btf, btf__resolve_type(btf, sinfo->type)); if (!base_type) { - fprintf(stderr, "Failed to resolve type %d\n", t->type); + fprintf(stderr, "Failed to resolve type %d\n", sinfo->type); return -EINVAL; } if (!is_preset_supported(base_type)) { @@ -1530,7 +1607,7 @@ static int set_global_var(struct bpf_object *obj, struct btf *btf, const struct if (value >= max_val || value < -max_val) { fprintf(stderr, "Variable %s value %lld is out of range [%lld; %lld]\n", - btf__name_by_offset(btf, t->name_off), value, + btf__name_by_offset(btf, base_type->name_off), value, is_signed ? -max_val : 0, max_val - 1); return -EINVAL; } @@ -1583,14 +1660,20 @@ static int set_global_vars(struct bpf_object *obj, struct var_preset *presets, i for (j = 0; j < n; ++j, ++sinfo) { const struct btf_type *var_type = btf__type_by_id(btf, sinfo->type); const char *var_name; + int var_len; if (!btf_is_var(var_type)) continue; var_name = btf__name_by_offset(btf, var_type->name_off); + var_len = strlen(var_name); for (k = 0; k < npresets; ++k) { - if (strcmp(var_name, presets[k].name) != 0) + struct btf_var_secinfo tmp_sinfo; + + if (strncmp(var_name, presets[k].name, var_len) != 0 || + (presets[k].name[var_len] != '\0' && + presets[k].name[var_len] != '.')) continue; if (presets[k].applied) { @@ -1598,13 +1681,17 @@ static int set_global_vars(struct bpf_object *obj, struct var_preset *presets, i var_name); return -EINVAL; } + tmp_sinfo = *sinfo; + err = adjust_var_secinfo(btf, var_type, + &tmp_sinfo, presets[k].name); + if (err) + return err; - err = set_global_var(obj, btf, var_type, map, sinfo, presets + k); + err = set_global_var(obj, btf, map, &tmp_sinfo, presets + k); if (err) return err; presets[k].applied = true; - break; } } } diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index ecbf07afc6dd0..ff21524be4580 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -3,7 +3,7 @@ TEST_GEN_FILES += access_memory access_memory_even -TEST_FILES = _chk_dependency.sh _damon_sysfs.py +TEST_FILES = _damon_sysfs.py # functionality tests TEST_PROGS += sysfs.sh diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh deleted file mode 100644 index dda3a87dc00a2..0000000000000 --- a/tools/testing/selftests/damon/_chk_dependency.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -DBGFS=$(grep debugfs /proc/mounts --max-count 1 | awk '{print $2}') -if [ "$DBGFS" = "" ] -then - echo "debugfs not mounted" - exit $ksft_skip -fi - -DBGFS+="/damon" - -if [ $EUID -ne 0 ]; -then - echo "Run as root" - exit $ksft_skip -fi - -if [ ! -d "$DBGFS" ] -then - echo "$DBGFS not found" - exit $ksft_skip -fi - -if [ -f "$DBGFS/monitor_on_DEPRECATED" ] -then - monitor_on_file="monitor_on_DEPRECATED" -else - monitor_on_file="monitor_on" -fi - -for f in attrs target_ids "$monitor_on_file" -do - if [ ! -f "$DBGFS/$f" ] - then - echo "$f not found" - exit 1 - fi -done - -permission_error="Operation not permitted" -for f in attrs target_ids "$monitor_on_file" -do - status=$( cat "$DBGFS/$f" 2>&1 ) - if [ "${status#*$permission_error}" != "$status" ]; then - echo "Permission for reading $DBGFS/$f denied; maybe secureboot enabled?" - exit $ksft_skip - fi -done diff --git a/tools/testing/selftests/damon/_debugfs_common.sh b/tools/testing/selftests/damon/_debugfs_common.sh deleted file mode 100644 index 54d45791b0d9c..0000000000000 --- a/tools/testing/selftests/damon/_debugfs_common.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -test_write_result() { - file=$1 - content=$2 - orig_content=$3 - expect_reason=$4 - expected=$5 - - if [ "$expected" = "0" ] - then - echo "$content" > "$file" - else - echo "$content" > "$file" 2> /dev/null - fi - if [ $? -ne "$expected" ] - then - echo "writing $content to $file doesn't return $expected" - echo "expected because: $expect_reason" - echo "$orig_content" > "$file" - exit 1 - fi -} - -test_write_succ() { - test_write_result "$1" "$2" "$3" "$4" 0 -} - -test_write_fail() { - test_write_result "$1" "$2" "$3" "$4" 1 -} - -test_content() { - file=$1 - orig_content=$2 - expected=$3 - expect_reason=$4 - - content=$(cat "$file") - if [ "$content" != "$expected" ] - then - echo "reading $file expected $expected but $content" - echo "expected because: $expect_reason" - echo "$orig_content" > "$file" - exit 1 - fi -} - -source ./_chk_dependency.sh - -damon_onoff="$DBGFS/monitor_on" -if [ -f "$DBGFS/monitor_on_DEPRECATED" ] -then - damon_onoff="$DBGFS/monitor_on_DEPRECATED" -else - damon_onoff="$DBGFS/monitor_on" -fi - -if [ $(cat "$damon_onoff") = "on" ] -then - echo "monitoring is on" - exit $ksft_skip -fi diff --git a/tools/testing/selftests/mincore/mincore_selftest.c b/tools/testing/selftests/mincore/mincore_selftest.c index efabfcbe0b498..17ed3e9917ca1 100644 --- a/tools/testing/selftests/mincore/mincore_selftest.c +++ b/tools/testing/selftests/mincore/mincore_selftest.c @@ -283,8 +283,7 @@ TEST(check_file_mmap) /* * Test mincore() behavior on a page backed by a tmpfs file. This test - * performs the same steps as the previous one. However, we don't expect - * any readahead in this case. + * performs the same steps as the previous one. */ TEST(check_tmpfs_mmap) { @@ -295,7 +294,6 @@ TEST(check_tmpfs_mmap) int page_size; int fd; int i; - int ra_pages = 0; page_size = sysconf(_SC_PAGESIZE); vec_size = FILE_SIZE / page_size; @@ -338,8 +336,7 @@ TEST(check_tmpfs_mmap) } /* - * Touch a page in the middle of the mapping. We expect only - * that page to be fetched into memory. + * Touch a page in the middle of the mapping. */ addr[FILE_SIZE / 2] = 1; retval = mincore(addr, FILE_SIZE, vec); @@ -348,15 +345,6 @@ TEST(check_tmpfs_mmap) TH_LOG("Page not found in memory after use"); } - i = FILE_SIZE / 2 / page_size + 1; - while (i < vec_size && vec[i]) { - ra_pages++; - i++; - } - ASSERT_EQ(ra_pages, 0) { - TH_LOG("Read-ahead pages found in memory"); - } - munmap(addr, FILE_SIZE); close(fd); free(vec); diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index c5241b193db8c..91db34941a143 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -58,3 +58,4 @@ hugetlb_dio pkey_sighandler_tests_32 pkey_sighandler_tests_64 guard-regions +merge diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 8270895039d1b..ad4d6043a60f0 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -98,6 +98,7 @@ TEST_GEN_FILES += hugetlb_madv_vs_map TEST_GEN_FILES += hugetlb_dio TEST_GEN_FILES += droppable TEST_GEN_FILES += guard-regions +TEST_GEN_FILES += merge ifneq ($(ARCH),arm64) TEST_GEN_FILES += soft-dirty diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh index 67df7b47087f0..e1fe16bcbbe88 100755 --- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -29,7 +29,7 @@ fi if [[ $cgroup2 ]]; then cgroup_path=$(mount -t cgroup2 | head -1 | awk '{print $3}') if [[ -z "$cgroup_path" ]]; then - cgroup_path=/dev/cgroup/memory + cgroup_path=$(mktemp -d) mount -t cgroup2 none $cgroup_path do_umount=1 fi @@ -37,7 +37,7 @@ if [[ $cgroup2 ]]; then else cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk '{print $3}') if [[ -z "$cgroup_path" ]]; then - cgroup_path=/dev/cgroup/memory + cgroup_path=$(mktemp -d) mount -t cgroup memory,hugetlb $cgroup_path do_umount=1 fi diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index f0cb14ea86084..b6cfe0a4b7dfd 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -293,7 +293,7 @@ static void do_test_vmsplice_in_parent(char *mem, size_t size, .iov_base = mem, .iov_len = size, }; - ssize_t cur, total, transferred; + ssize_t cur, total, transferred = 0; struct comm_pipes comm_pipes; char *old, *new; int ret, fds[2]; diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c index b3d0e27710961..c39dd26c47a38 100644 --- a/tools/testing/selftests/mm/guard-regions.c +++ b/tools/testing/selftests/mm/guard-regions.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -2071,4 +2072,60 @@ TEST_F(guard_regions, pagemap) ASSERT_EQ(munmap(ptr, 10 * page_size), 0); } +/* + * Assert that PAGEMAP_SCAN correctly reports guard region ranges. + */ +TEST_F(guard_regions, pagemap_scan) +{ + const unsigned long page_size = self->page_size; + struct page_region pm_regs[10]; + struct pm_scan_arg pm_scan_args = { + .size = sizeof(struct pm_scan_arg), + .category_anyof_mask = PAGE_IS_GUARD, + .return_mask = PAGE_IS_GUARD, + .vec = (long)&pm_regs, + .vec_len = ARRAY_SIZE(pm_regs), + }; + int proc_fd, i; + char *ptr; + + proc_fd = open("/proc/self/pagemap", O_RDONLY); + ASSERT_NE(proc_fd, -1); + + ptr = mmap_(self, variant, NULL, 10 * page_size, + PROT_READ | PROT_WRITE, 0, 0); + ASSERT_NE(ptr, MAP_FAILED); + + pm_scan_args.start = (long)ptr; + pm_scan_args.end = (long)ptr + 10 * page_size; + ASSERT_EQ(ioctl(proc_fd, PAGEMAP_SCAN, &pm_scan_args), 0); + ASSERT_EQ(pm_scan_args.walk_end, (long)ptr + 10 * page_size); + + /* Install a guard region in every other page. */ + for (i = 0; i < 10; i += 2) { + char *ptr_p = &ptr[i * page_size]; + + ASSERT_EQ(syscall(__NR_madvise, ptr_p, page_size, MADV_GUARD_INSTALL), 0); + } + + /* + * Assert ioctl() returns the count of located regions, where each + * region spans every other page within the range of 10 pages. + */ + ASSERT_EQ(ioctl(proc_fd, PAGEMAP_SCAN, &pm_scan_args), 5); + ASSERT_EQ(pm_scan_args.walk_end, (long)ptr + 10 * page_size); + + /* Re-read from pagemap, and assert guard regions are detected. */ + for (i = 0; i < 5; i++) { + long ptr_p = (long)&ptr[2 * i * page_size]; + + ASSERT_EQ(pm_regs[i].start, ptr_p); + ASSERT_EQ(pm_regs[i].end, ptr_p + page_size); + ASSERT_EQ(pm_regs[i].categories, PAGE_IS_GUARD); + } + + ASSERT_EQ(close(proc_fd), 0); + ASSERT_EQ(munmap(ptr, 10 * page_size), 0); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh index 11f9bbe7dc222..0dd31892ff679 100755 --- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh +++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh @@ -23,7 +23,7 @@ fi if [[ $cgroup2 ]]; then CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk '{print $3}') if [[ -z "$CGROUP_ROOT" ]]; then - CGROUP_ROOT=/dev/cgroup/memory + CGROUP_ROOT=$(mktemp -d) mount -t cgroup2 none $CGROUP_ROOT do_umount=1 fi @@ -36,7 +36,7 @@ else do_umount=1 fi fi -MNT='/mnt/huge/' +MNT='/mnt/huge' function get_machine_hugepage_size() { hpz=$(grep -i hugepagesize /proc/meminfo) @@ -56,10 +56,45 @@ function cleanup() { rmdir "$CGROUP_ROOT"/a/b 2>/dev/null rmdir "$CGROUP_ROOT"/a 2>/dev/null rmdir "$CGROUP_ROOT"/test1 2>/dev/null - echo 0 >/proc/sys/vm/nr_hugepages + echo $nr_hugepgs >/proc/sys/vm/nr_hugepages set -e } +function assert_with_retry() { + local actual_path="$1" + local expected="$2" + local tolerance=$((7 * 1024 * 1024)) + local timeout=20 + local interval=1 + local start_time + local now + local elapsed + local actual + + start_time=$(date +%s) + + while true; do + actual="$(cat "$actual_path")" + + if [[ $actual -ge $(($expected - $tolerance)) ]] && + [[ $actual -le $(($expected + $tolerance)) ]]; then + return 0 + fi + + now=$(date +%s) + elapsed=$((now - start_time)) + + if [[ $elapsed -ge $timeout ]]; then + echo "actual = $((${actual%% *} / 1024 / 1024)) MB" + echo "expected = $((${expected%% *} / 1024 / 1024)) MB" + cleanup + exit 1 + fi + + sleep $interval + done +} + function assert_state() { local expected_a="$1" local expected_a_hugetlb="$2" @@ -70,58 +105,13 @@ function assert_state() { expected_b="$3" expected_b_hugetlb="$4" fi - local tolerance=$((5 * 1024 * 1024)) - - local actual_a - actual_a="$(cat "$CGROUP_ROOT"/a/memory.$usage_file)" - if [[ $actual_a -lt $(($expected_a - $tolerance)) ]] || - [[ $actual_a -gt $(($expected_a + $tolerance)) ]]; then - echo actual a = $((${actual_a%% *} / 1024 / 1024)) MB - echo expected a = $((${expected_a%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 - fi - local actual_a_hugetlb - actual_a_hugetlb="$(cat "$CGROUP_ROOT"/a/hugetlb.${MB}MB.$usage_file)" - if [[ $actual_a_hugetlb -lt $(($expected_a_hugetlb - $tolerance)) ]] || - [[ $actual_a_hugetlb -gt $(($expected_a_hugetlb + $tolerance)) ]]; then - echo actual a hugetlb = $((${actual_a_hugetlb%% *} / 1024 / 1024)) MB - echo expected a hugetlb = $((${expected_a_hugetlb%% *} / 1024 / 1024)) MB - echo fail + assert_with_retry "$CGROUP_ROOT/a/memory.$usage_file" "$expected_a" + assert_with_retry "$CGROUP_ROOT/a/hugetlb.${MB}MB.$usage_file" "$expected_a_hugetlb" - cleanup - exit 1 - fi - - if [[ -z "$expected_b" || -z "$expected_b_hugetlb" ]]; then - return - fi - - local actual_b - actual_b="$(cat "$CGROUP_ROOT"/a/b/memory.$usage_file)" - if [[ $actual_b -lt $(($expected_b - $tolerance)) ]] || - [[ $actual_b -gt $(($expected_b + $tolerance)) ]]; then - echo actual b = $((${actual_b%% *} / 1024 / 1024)) MB - echo expected b = $((${expected_b%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 - fi - - local actual_b_hugetlb - actual_b_hugetlb="$(cat "$CGROUP_ROOT"/a/b/hugetlb.${MB}MB.$usage_file)" - if [[ $actual_b_hugetlb -lt $(($expected_b_hugetlb - $tolerance)) ]] || - [[ $actual_b_hugetlb -gt $(($expected_b_hugetlb + $tolerance)) ]]; then - echo actual b hugetlb = $((${actual_b_hugetlb%% *} / 1024 / 1024)) MB - echo expected b hugetlb = $((${expected_b_hugetlb%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 + if [[ -n "$expected_b" && -n "$expected_b_hugetlb" ]]; then + assert_with_retry "$CGROUP_ROOT/a/b/memory.$usage_file" "$expected_b" + assert_with_retry "$CGROUP_ROOT/a/b/hugetlb.${MB}MB.$usage_file" "$expected_b_hugetlb" fi } @@ -174,7 +164,6 @@ size=$((${MB} * 1024 * 1024 * 25)) # 50MB = 25 * 2MB hugepages. cleanup -echo echo echo Test charge, rmdir, uncharge setup @@ -195,7 +184,6 @@ cleanup echo done echo -echo if [[ ! $cgroup2 ]]; then echo "Test parent and child hugetlb usage" setup @@ -212,7 +200,6 @@ if [[ ! $cgroup2 ]]; then assert_state 0 $(($size * 2)) 0 $size rmdir "$CGROUP_ROOT"/a/b - sleep 5 echo Assert memory reparent correctly. assert_state 0 $(($size * 2)) @@ -224,7 +211,6 @@ if [[ ! $cgroup2 ]]; then cleanup fi -echo echo echo "Test child only hugetlb usage" echo setup diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c new file mode 100644 index 0000000000000..c76646cdf6e6f --- /dev/null +++ b/tools/testing/selftests/mm/merge.c @@ -0,0 +1,455 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#define _GNU_SOURCE +#include "../kselftest_harness.h" +#include +#include +#include +#include +#include +#include "vm_util.h" + +FIXTURE(merge) +{ + unsigned int page_size; + char *carveout; + struct procmap_fd procmap; +}; + +FIXTURE_SETUP(merge) +{ + self->page_size = psize(); + /* Carve out PROT_NONE region to map over. */ + self->carveout = mmap(NULL, 12 * self->page_size, PROT_NONE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(self->carveout, MAP_FAILED); + /* Setup PROCMAP_QUERY interface. */ + ASSERT_EQ(open_self_procmap(&self->procmap), 0); +} + +FIXTURE_TEARDOWN(merge) +{ + ASSERT_EQ(munmap(self->carveout, 12 * self->page_size), 0); + ASSERT_EQ(close_procmap(&self->procmap), 0); +} + +TEST_F(merge, mprotect_unfaulted_left) +{ + unsigned int page_size = self->page_size; + char *carveout = self->carveout; + struct procmap_fd *procmap = &self->procmap; + char *ptr; + + /* + * Map 10 pages of R/W memory within. MAP_NORESERVE so we don't hit + * merge failure due to lack of VM_ACCOUNT flag by mistake. + * + * |-----------------------| + * | unfaulted | + * |-----------------------| + */ + ptr = mmap(&carveout[page_size], 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + /* + * Now make the first 5 pages read-only, splitting the VMA: + * + * RO RW + * |-----------|-----------| + * | unfaulted | unfaulted | + * |-----------|-----------| + */ + ASSERT_EQ(mprotect(ptr, 5 * page_size, PROT_READ), 0); + /* + * Fault in the first of the last 5 pages so it gets an anon_vma and + * thus the whole VMA becomes 'faulted': + * + * RO RW + * |-----------|-----------| + * | unfaulted | faulted | + * |-----------|-----------| + */ + ptr[5 * page_size] = 'x'; + /* + * Now mprotect() the RW region read-only, we should merge (though for + * ~15 years we did not! :): + * + * RO + * |-----------------------| + * | faulted | + * |-----------------------| + */ + ASSERT_EQ(mprotect(&ptr[5 * page_size], 5 * page_size, PROT_READ), 0); + + /* Assert that the merge succeeded using PROCMAP_QUERY. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 10 * page_size); +} + +TEST_F(merge, mprotect_unfaulted_right) +{ + unsigned int page_size = self->page_size; + char *carveout = self->carveout; + struct procmap_fd *procmap = &self->procmap; + char *ptr; + + /* + * |-----------------------| + * | unfaulted | + * |-----------------------| + */ + ptr = mmap(&carveout[page_size], 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + /* + * Now make the last 5 pages read-only, splitting the VMA: + * + * RW RO + * |-----------|-----------| + * | unfaulted | unfaulted | + * |-----------|-----------| + */ + ASSERT_EQ(mprotect(&ptr[5 * page_size], 5 * page_size, PROT_READ), 0); + /* + * Fault in the first of the first 5 pages so it gets an anon_vma and + * thus the whole VMA becomes 'faulted': + * + * RW RO + * |-----------|-----------| + * | faulted | unfaulted | + * |-----------|-----------| + */ + ptr[0] = 'x'; + /* + * Now mprotect() the RW region read-only, we should merge: + * + * RO + * |-----------------------| + * | faulted | + * |-----------------------| + */ + ASSERT_EQ(mprotect(ptr, 5 * page_size, PROT_READ), 0); + + /* Assert that the merge succeeded using PROCMAP_QUERY. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 10 * page_size); +} + +TEST_F(merge, mprotect_unfaulted_both) +{ + unsigned int page_size = self->page_size; + char *carveout = self->carveout; + struct procmap_fd *procmap = &self->procmap; + char *ptr; + + /* + * |-----------------------| + * | unfaulted | + * |-----------------------| + */ + ptr = mmap(&carveout[2 * page_size], 9 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + /* + * Now make the first and last 3 pages read-only, splitting the VMA: + * + * RO RW RO + * |-----------|-----------|-----------| + * | unfaulted | unfaulted | unfaulted | + * |-----------|-----------|-----------| + */ + ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0); + ASSERT_EQ(mprotect(&ptr[6 * page_size], 3 * page_size, PROT_READ), 0); + /* + * Fault in the first of the middle 3 pages so it gets an anon_vma and + * thus the whole VMA becomes 'faulted': + * + * RO RW RO + * |-----------|-----------|-----------| + * | unfaulted | faulted | unfaulted | + * |-----------|-----------|-----------| + */ + ptr[3 * page_size] = 'x'; + /* + * Now mprotect() the RW region read-only, we should merge: + * + * RO + * |-----------------------| + * | faulted | + * |-----------------------| + */ + ASSERT_EQ(mprotect(&ptr[3 * page_size], 3 * page_size, PROT_READ), 0); + + /* Assert that the merge succeeded using PROCMAP_QUERY. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 9 * page_size); +} + +TEST_F(merge, mprotect_faulted_left_unfaulted_right) +{ + unsigned int page_size = self->page_size; + char *carveout = self->carveout; + struct procmap_fd *procmap = &self->procmap; + char *ptr; + + /* + * |-----------------------| + * | unfaulted | + * |-----------------------| + */ + ptr = mmap(&carveout[2 * page_size], 9 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + /* + * Now make the last 3 pages read-only, splitting the VMA: + * + * RW RO + * |-----------------------|-----------| + * | unfaulted | unfaulted | + * |-----------------------|-----------| + */ + ASSERT_EQ(mprotect(&ptr[6 * page_size], 3 * page_size, PROT_READ), 0); + /* + * Fault in the first of the first 6 pages so it gets an anon_vma and + * thus the whole VMA becomes 'faulted': + * + * RW RO + * |-----------------------|-----------| + * | unfaulted | unfaulted | + * |-----------------------|-----------| + */ + ptr[0] = 'x'; + /* + * Now make the first 3 pages read-only, splitting the VMA: + * + * RO RW RO + * |-----------|-----------|-----------| + * | faulted | faulted | unfaulted | + * |-----------|-----------|-----------| + */ + ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0); + /* + * Now mprotect() the RW region read-only, we should merge: + * + * RO + * |-----------------------| + * | faulted | + * |-----------------------| + */ + ASSERT_EQ(mprotect(&ptr[3 * page_size], 3 * page_size, PROT_READ), 0); + + /* Assert that the merge succeeded using PROCMAP_QUERY. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 9 * page_size); +} + +TEST_F(merge, mprotect_unfaulted_left_faulted_right) +{ + unsigned int page_size = self->page_size; + char *carveout = self->carveout; + struct procmap_fd *procmap = &self->procmap; + char *ptr; + + /* + * |-----------------------| + * | unfaulted | + * |-----------------------| + */ + ptr = mmap(&carveout[2 * page_size], 9 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + /* + * Now make the first 3 pages read-only, splitting the VMA: + * + * RO RW + * |-----------|-----------------------| + * | unfaulted | unfaulted | + * |-----------|-----------------------| + */ + ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0); + /* + * Fault in the first of the last 6 pages so it gets an anon_vma and + * thus the whole VMA becomes 'faulted': + * + * RO RW + * |-----------|-----------------------| + * | unfaulted | faulted | + * |-----------|-----------------------| + */ + ptr[3 * page_size] = 'x'; + /* + * Now make the last 3 pages read-only, splitting the VMA: + * + * RO RW RO + * |-----------|-----------|-----------| + * | unfaulted | faulted | faulted | + * |-----------|-----------|-----------| + */ + ASSERT_EQ(mprotect(&ptr[6 * page_size], 3 * page_size, PROT_READ), 0); + /* + * Now mprotect() the RW region read-only, we should merge: + * + * RO + * |-----------------------| + * | faulted | + * |-----------------------| + */ + ASSERT_EQ(mprotect(&ptr[3 * page_size], 3 * page_size, PROT_READ), 0); + + /* Assert that the merge succeeded using PROCMAP_QUERY. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 9 * page_size); +} + +TEST_F(merge, forked_target_vma) +{ + unsigned int page_size = self->page_size; + char *carveout = self->carveout; + struct procmap_fd *procmap = &self->procmap; + pid_t pid; + char *ptr, *ptr2; + int i; + + /* + * |-----------| + * | unfaulted | + * |-----------| + */ + ptr = mmap(&carveout[page_size], 5 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* + * Fault in process. + * + * |-----------| + * | faulted | + * |-----------| + */ + ptr[0] = 'x'; + + pid = fork(); + ASSERT_NE(pid, -1); + + if (pid != 0) { + wait(NULL); + return; + } + + /* Child process below: */ + + /* Reopen for child. */ + ASSERT_EQ(close_procmap(&self->procmap), 0); + ASSERT_EQ(open_self_procmap(&self->procmap), 0); + + /* unCOWing everything does not cause the AVC to go away. */ + for (i = 0; i < 5 * page_size; i += page_size) + ptr[i] = 'x'; + + /* + * Map in adjacent VMA in child. + * + * forked + * |-----------|-----------| + * | faulted | unfaulted | + * |-----------|-----------| + * ptr ptr2 + */ + ptr2 = mmap(&ptr[5 * page_size], 5 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + ASSERT_NE(ptr2, MAP_FAILED); + + /* Make sure not merged. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 5 * page_size); +} + +TEST_F(merge, forked_source_vma) +{ + unsigned int page_size = self->page_size; + char *carveout = self->carveout; + struct procmap_fd *procmap = &self->procmap; + pid_t pid; + char *ptr, *ptr2; + int i; + + /* + * |-----------|------------| + * | unfaulted | | + * |-----------|------------| + */ + ptr = mmap(&carveout[page_size], 5 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* + * Fault in process. + * + * |-----------|------------| + * | faulted | | + * |-----------|------------| + */ + ptr[0] = 'x'; + + pid = fork(); + ASSERT_NE(pid, -1); + + if (pid != 0) { + wait(NULL); + return; + } + + /* Child process below: */ + + /* Reopen for child. */ + ASSERT_EQ(close_procmap(&self->procmap), 0); + ASSERT_EQ(open_self_procmap(&self->procmap), 0); + + /* unCOWing everything does not cause the AVC to go away. */ + for (i = 0; i < 5 * page_size; i += page_size) + ptr[i] = 'x'; + + /* + * Map in adjacent VMA in child, ptr2 after ptr, but incompatible. + * + * forked RW RWX + * |-----------|-----------| + * | faulted | unfaulted | + * |-----------|-----------| + * ptr ptr2 + */ + ptr2 = mmap(&carveout[6 * page_size], 5 * page_size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0); + ASSERT_NE(ptr2, MAP_FAILED); + + /* Make sure not merged. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr2)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr2); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr2 + 5 * page_size); + + /* + * Now mprotect forked region to RWX so it becomes the source for the + * merge to unfaulted region: + * + * forked RWX RWX + * |-----------|-----------| + * | faulted | unfaulted | + * |-----------|-----------| + * ptr ptr2 + * + * This should NOT result in a merge, as ptr was forked. + */ + ASSERT_EQ(mprotect(ptr, 5 * page_size, PROT_READ | PROT_WRITE | PROT_EXEC), 0); + /* Again, make sure not merged. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr2)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr2); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr2 + 5 * page_size); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 57b4bba2b45f3..fe5ae8b25ff61 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -34,7 +34,7 @@ #define PAGEMAP "/proc/self/pagemap" int pagemap_fd; int uffd; -unsigned int page_size; +unsigned long page_size; unsigned int hpage_size; const char *progname; @@ -184,7 +184,7 @@ void *gethugetlb_mem(int size, int *shmid) int userfaultfd_tests(void) { - int mem_size, vec_size, written, num_pages = 16; + long mem_size, vec_size, written, num_pages = 16; char *mem, *vec; mem_size = num_pages * page_size; @@ -213,7 +213,7 @@ int userfaultfd_tests(void) written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, vec_size - 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (written < 0) - ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", written, errno, strerror(errno)); ksft_test_result(written == 0, "%s all new pages must not be written (dirty)\n", __func__); @@ -995,7 +995,7 @@ int unmapped_region_tests(void) { void *start = (void *)0x10000000; int written, len = 0x00040000; - int vec_size = len / page_size; + long vec_size = len / page_size; struct page_region *vec = malloc(sizeof(struct page_region) * vec_size); /* 1. Get written pages */ @@ -1051,7 +1051,7 @@ static void test_simple(void) int sanity_tests(void) { unsigned long long mem_size, vec_size; - int ret, fd, i, buf_size; + long ret, fd, i, buf_size; struct page_region *vec; char *mem, *fmem; struct stat sbuf; @@ -1160,7 +1160,7 @@ int sanity_tests(void) ret = stat(progname, &sbuf); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); fmem = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (fmem == MAP_FAILED) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 9aff33b109997..188b125bf1f6b 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -79,6 +79,8 @@ separated by spaces: test prctl(PR_SET_MDWE, ...) - page_frag test handling of page fragment allocation and freeing +- vma_merge + test VMA merge cases behave as expected example: ./run_vmtests.sh -t "hmm mmap ksm" EOF @@ -421,6 +423,8 @@ CATEGORY="madv_guard" run_test ./guard-regions # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests CATEGORY="madv_populate" run_test ./madv_populate +CATEGORY="vma_merge" run_test ./merge + if [ -x ./memfd_secret ] then (echo 0 > /proc/sys/kernel/yama/ptrace_scope 2>&1) | tap_prefix diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index a36734fb62f38..1357e2d6a7b68 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include #include @@ -424,3 +425,64 @@ bool check_vmflag_io(void *addr) flags += flaglen; } } + +/* + * Open an fd at /proc/$pid/maps and configure procmap_out ready for + * PROCMAP_QUERY query. Returns 0 on success, or an error code otherwise. + */ +int open_procmap(pid_t pid, struct procmap_fd *procmap_out) +{ + char path[256]; + int ret = 0; + + memset(procmap_out, '\0', sizeof(*procmap_out)); + sprintf(path, "/proc/%d/maps", pid); + procmap_out->query.size = sizeof(procmap_out->query); + procmap_out->fd = open(path, O_RDONLY); + if (procmap_out < 0) + ret = -errno; + + return ret; +} + +/* Perform PROCMAP_QUERY. Returns 0 on success, or an error code otherwise. */ +int query_procmap(struct procmap_fd *procmap) +{ + int ret = 0; + + if (ioctl(procmap->fd, PROCMAP_QUERY, &procmap->query) == -1) + ret = -errno; + + return ret; +} + +/* + * Try to find the VMA at specified address, returns true if found, false if not + * found, and the test is failed if any other error occurs. + * + * On success, procmap->query is populated with the results. + */ +bool find_vma_procmap(struct procmap_fd *procmap, void *address) +{ + int err; + + procmap->query.query_flags = 0; + procmap->query.query_addr = (unsigned long)address; + err = query_procmap(procmap); + if (!err) + return true; + + if (err != -ENOENT) + ksft_exit_fail_msg("%s: Error %d on ioctl(PROCMAP_QUERY)\n", + __func__, err); + return false; +} + +/* + * Close fd used by PROCMAP_QUERY mechanism. Returns 0 on success, or an error + * code otherwise. + */ +int close_procmap(struct procmap_fd *procmap) +{ + return close(procmap->fd); +} diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 6effafdc4d8a2..9211ba640d9cd 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -6,6 +6,7 @@ #include /* ffsl() */ #include /* _SC_PAGESIZE */ #include "../kselftest.h" +#include #define BIT_ULL(nr) (1ULL << (nr)) #define PM_SOFT_DIRTY BIT_ULL(55) @@ -19,6 +20,15 @@ extern unsigned int __page_size; extern unsigned int __page_shift; +/* + * Represents an open fd and PROCMAP_QUERY state for binary (via ioctl) + * /proc/$pid/[s]maps lookup. + */ +struct procmap_fd { + int fd; + struct procmap_query query; +}; + static inline unsigned int psize(void) { if (!__page_size) @@ -73,6 +83,17 @@ int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len, bool miss, bool wp, bool minor, uint64_t *ioctls); unsigned long get_free_hugepages(void); bool check_vmflag_io(void *addr); +int open_procmap(pid_t pid, struct procmap_fd *procmap_out); +int query_procmap(struct procmap_fd *procmap); +bool find_vma_procmap(struct procmap_fd *procmap, void *address); +int close_procmap(struct procmap_fd *procmap); + +static inline int open_self_procmap(struct procmap_fd *procmap_out) +{ + pid_t pid = getpid(); + + return open_procmap(pid, procmap_out); +} /* * On ppc64 this will only work with radix 2M hugepage size diff --git a/tools/testing/selftests/ptrace/Makefile b/tools/testing/selftests/ptrace/Makefile index 1c631740a730a..c5e0b76ba6ac5 100644 --- a/tools/testing/selftests/ptrace/Makefile +++ b/tools/testing/selftests/ptrace/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -std=c99 -pthread -Wall $(KHDR_INCLUDES) -TEST_GEN_PROGS := get_syscall_info peeksiginfo vmaccess get_set_sud +TEST_GEN_PROGS := get_syscall_info set_syscall_info peeksiginfo vmaccess get_set_sud include ../lib.mk diff --git a/tools/testing/selftests/ptrace/set_syscall_info.c b/tools/testing/selftests/ptrace/set_syscall_info.c new file mode 100644 index 0000000000000..4198248ef8749 --- /dev/null +++ b/tools/testing/selftests/ptrace/set_syscall_info.c @@ -0,0 +1,519 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2018-2025 Dmitry V. Levin + * All rights reserved. + * + * Check whether PTRACE_SET_SYSCALL_INFO semantics implemented in the kernel + * matches userspace expectations. + */ + +#include "../kselftest_harness.h" +#include +#include +#include +#include +#include +#include + +#if defined(_MIPS_SIM) && _MIPS_SIM == _MIPS_SIM_NABI32 +/* + * MIPS N32 is the only architecture where __kernel_ulong_t + * does not match the bitness of syscall arguments. + */ +typedef unsigned long long kernel_ulong_t; +#else +typedef __kernel_ulong_t kernel_ulong_t; +#endif + +struct si_entry { + int nr; + kernel_ulong_t args[6]; +}; +struct si_exit { + unsigned int is_error; + int rval; +}; + +static unsigned int ptrace_stop; +static pid_t tracee_pid; + +static int +kill_tracee(pid_t pid) +{ + if (!pid) + return 0; + + int saved_errno = errno; + + int rc = kill(pid, SIGKILL); + + errno = saved_errno; + return rc; +} + +static long +sys_ptrace(int request, pid_t pid, unsigned long addr, unsigned long data) +{ + return syscall(__NR_ptrace, request, pid, addr, data); +} + +#define LOG_KILL_TRACEE(fmt, ...) \ + do { \ + kill_tracee(tracee_pid); \ + TH_LOG("wait #%d: " fmt, \ + ptrace_stop, ##__VA_ARGS__); \ + } while (0) + +static void +check_psi_entry(struct __test_metadata *_metadata, + const struct ptrace_syscall_info *info, + const struct si_entry *exp_entry, + const char *text) +{ + unsigned int i; + int exp_nr = exp_entry->nr; +#if defined __s390__ || defined __s390x__ + /* s390 is the only architecture that has 16-bit syscall numbers */ + exp_nr &= 0xffff; +#endif + + ASSERT_EQ(PTRACE_SYSCALL_INFO_ENTRY, info->op) { + LOG_KILL_TRACEE("%s: entry stop mismatch", text); + } + ASSERT_TRUE(info->arch) { + LOG_KILL_TRACEE("%s: entry stop mismatch", text); + } + ASSERT_TRUE(info->instruction_pointer) { + LOG_KILL_TRACEE("%s: entry stop mismatch", text); + } + ASSERT_TRUE(info->stack_pointer) { + LOG_KILL_TRACEE("%s: entry stop mismatch", text); + } + ASSERT_EQ(exp_nr, info->entry.nr) { + LOG_KILL_TRACEE("%s: syscall nr mismatch", text); + } + for (i = 0; i < ARRAY_SIZE(exp_entry->args); ++i) { + ASSERT_EQ(exp_entry->args[i], info->entry.args[i]) { + LOG_KILL_TRACEE("%s: syscall arg #%u mismatch", + text, i); + } + } +} + +static void +check_psi_exit(struct __test_metadata *_metadata, + const struct ptrace_syscall_info *info, + const struct si_exit *exp_exit, + const char *text) +{ + ASSERT_EQ(PTRACE_SYSCALL_INFO_EXIT, info->op) { + LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } + ASSERT_TRUE(info->arch) { + LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } + ASSERT_TRUE(info->instruction_pointer) { + LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } + ASSERT_TRUE(info->stack_pointer) { + LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } + ASSERT_EQ(exp_exit->is_error, info->exit.is_error) { + LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } + ASSERT_EQ(exp_exit->rval, info->exit.rval) { + LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } +} + +TEST(set_syscall_info) +{ + const pid_t tracer_pid = getpid(); + const kernel_ulong_t dummy[] = { + (kernel_ulong_t) 0xdad0bef0bad0fed0ULL, + (kernel_ulong_t) 0xdad1bef1bad1fed1ULL, + (kernel_ulong_t) 0xdad2bef2bad2fed2ULL, + (kernel_ulong_t) 0xdad3bef3bad3fed3ULL, + (kernel_ulong_t) 0xdad4bef4bad4fed4ULL, + (kernel_ulong_t) 0xdad5bef5bad5fed5ULL, + }; + int splice_in[2], splice_out[2]; + + ASSERT_EQ(0, pipe(splice_in)); + ASSERT_EQ(0, pipe(splice_out)); + ASSERT_EQ(sizeof(dummy), write(splice_in[1], dummy, sizeof(dummy))); + + const struct { + struct si_entry entry[2]; + struct si_exit exit[2]; + } si[] = { + /* change scno, keep non-error rval */ + { + { + { + __NR_gettid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_getppid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 0, tracer_pid }, { 0, tracer_pid } + } + }, + + /* set scno to -1, keep error rval */ + { + { + { + __NR_chdir, + { + (uintptr_t) ".", + dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + -1, + { + (uintptr_t) ".", + dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 1, -ENOSYS }, { 1, -ENOSYS } + } + }, + + /* keep scno, change non-error rval */ + { + { + { + __NR_getppid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_getppid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 0, tracer_pid }, { 0, tracer_pid + 1 } + } + }, + + /* change arg1, keep non-error rval */ + { + { + { + __NR_chdir, + { + (uintptr_t) "", + dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_chdir, + { + (uintptr_t) ".", + dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 0, 0 }, { 0, 0 } + } + }, + + /* set scno to -1, change error rval to non-error */ + { + { + { + __NR_gettid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + -1, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 1, -ENOSYS }, { 0, tracer_pid } + } + }, + + /* change scno, change non-error rval to error */ + { + { + { + __NR_chdir, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_getppid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 0, tracer_pid }, { 1, -EISDIR } + } + }, + + /* change scno and all args, change non-error rval */ + { + { + { + __NR_gettid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_splice, + { + splice_in[0], 0, splice_out[1], 0, + sizeof(dummy), SPLICE_F_NONBLOCK + } + } + }, { + { 0, sizeof(dummy) }, { 0, sizeof(dummy) + 1 } + } + }, + + /* change arg1, no exit stop */ + { + { + { + __NR_exit_group, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_exit_group, + { + 0, dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 0, 0 }, { 0, 0 } + } + }, + }; + + long rc; + unsigned int i; + + tracee_pid = fork(); + + ASSERT_LE(0, tracee_pid) { + TH_LOG("fork: %m"); + } + + if (tracee_pid == 0) { + /* get the pid before PTRACE_TRACEME */ + tracee_pid = getpid(); + ASSERT_EQ(0, sys_ptrace(PTRACE_TRACEME, 0, 0, 0)) { + TH_LOG("PTRACE_TRACEME: %m"); + } + ASSERT_EQ(0, kill(tracee_pid, SIGSTOP)) { + /* cannot happen */ + TH_LOG("kill SIGSTOP: %m"); + } + for (i = 0; i < ARRAY_SIZE(si); ++i) { + rc = syscall(si[i].entry[0].nr, + si[i].entry[0].args[0], + si[i].entry[0].args[1], + si[i].entry[0].args[2], + si[i].entry[0].args[3], + si[i].entry[0].args[4], + si[i].entry[0].args[5]); + if (si[i].exit[1].is_error) { + if (rc != -1 || errno != -si[i].exit[1].rval) + break; + } else { + if (rc != si[i].exit[1].rval) + break; + } + } + /* + * Something went wrong, but in this state tracee + * cannot reliably issue syscalls, so just crash. + */ + *(volatile unsigned char *) (uintptr_t) i = 42; + /* unreachable */ + _exit(i + 1); + } + + for (ptrace_stop = 0; ; ++ptrace_stop) { + struct ptrace_syscall_info info = { + .op = 0xff /* invalid PTRACE_SYSCALL_INFO_* op */ + }; + const size_t size = sizeof(info); + const int expected_entry_size = + (void *) &info.entry.args[6] - (void *) &info; + const int expected_exit_size = + (void *) (&info.exit.is_error + 1) - + (void *) &info; + int status; + + ASSERT_EQ(tracee_pid, wait(&status)) { + /* cannot happen */ + LOG_KILL_TRACEE("wait: %m"); + } + if (WIFEXITED(status)) { + tracee_pid = 0; /* the tracee is no more */ + ASSERT_EQ(0, WEXITSTATUS(status)) { + LOG_KILL_TRACEE("unexpected exit status %u", + WEXITSTATUS(status)); + } + break; + } + ASSERT_FALSE(WIFSIGNALED(status)) { + tracee_pid = 0; /* the tracee is no more */ + LOG_KILL_TRACEE("unexpected signal %u", + WTERMSIG(status)); + } + ASSERT_TRUE(WIFSTOPPED(status)) { + /* cannot happen */ + LOG_KILL_TRACEE("unexpected wait status %#x", status); + } + + ASSERT_LT(ptrace_stop, ARRAY_SIZE(si) * 2) { + LOG_KILL_TRACEE("ptrace stop overflow"); + } + + switch (WSTOPSIG(status)) { + case SIGSTOP: + ASSERT_EQ(0, ptrace_stop) { + LOG_KILL_TRACEE("unexpected signal stop"); + } + ASSERT_EQ(0, sys_ptrace(PTRACE_SETOPTIONS, tracee_pid, + 0, PTRACE_O_TRACESYSGOOD)) { + LOG_KILL_TRACEE("PTRACE_SETOPTIONS: %m"); + } + break; + + case SIGTRAP | 0x80: + ASSERT_LT(0, ptrace_stop) { + LOG_KILL_TRACEE("unexpected syscall stop"); + } + ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO, + tracee_pid, size, + (uintptr_t) &info))) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #1: %m"); + } + if (ptrace_stop & 1) { + /* entering syscall */ + const struct si_entry *exp_entry = + &si[ptrace_stop / 2].entry[0]; + const struct si_entry *set_entry = + &si[ptrace_stop / 2].entry[1]; + + /* check ptrace_syscall_info before the changes */ + ASSERT_EQ(expected_entry_size, rc) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #1" + ": entry stop mismatch"); + } + check_psi_entry(_metadata, &info, exp_entry, + "PTRACE_GET_SYSCALL_INFO #1"); + + /* apply the changes */ + info.entry.nr = set_entry->nr; + for (i = 0; i < ARRAY_SIZE(set_entry->args); ++i) + info.entry.args[i] = set_entry->args[i]; + ASSERT_EQ(0, sys_ptrace(PTRACE_SET_SYSCALL_INFO, + tracee_pid, size, + (uintptr_t) &info)) { + LOG_KILL_TRACEE("PTRACE_SET_SYSCALL_INFO: %m"); + } + + /* check ptrace_syscall_info after the changes */ + memset(&info, 0, sizeof(info)); + info.op = 0xff; + ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO, + tracee_pid, size, + (uintptr_t) &info))) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO: %m"); + } + ASSERT_EQ(expected_entry_size, rc) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #2" + ": entry stop mismatch"); + } + check_psi_entry(_metadata, &info, set_entry, + "PTRACE_GET_SYSCALL_INFO #2"); + } else { + /* exiting syscall */ + const struct si_exit *exp_exit = + &si[ptrace_stop / 2 - 1].exit[0]; + const struct si_exit *set_exit = + &si[ptrace_stop / 2 - 1].exit[1]; + + /* check ptrace_syscall_info before the changes */ + ASSERT_EQ(expected_exit_size, rc) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #1" + ": exit stop mismatch"); + } + check_psi_exit(_metadata, &info, exp_exit, + "PTRACE_GET_SYSCALL_INFO #1"); + + /* apply the changes */ + info.exit.is_error = set_exit->is_error; + info.exit.rval = set_exit->rval; + ASSERT_EQ(0, sys_ptrace(PTRACE_SET_SYSCALL_INFO, + tracee_pid, size, + (uintptr_t) &info)) { + LOG_KILL_TRACEE("PTRACE_SET_SYSCALL_INFO: %m"); + } + + /* check ptrace_syscall_info after the changes */ + memset(&info, 0, sizeof(info)); + info.op = 0xff; + ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO, + tracee_pid, size, + (uintptr_t) &info))) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #2: %m"); + } + ASSERT_EQ(expected_exit_size, rc) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #2" + ": exit stop mismatch"); + } + check_psi_exit(_metadata, &info, set_exit, + "PTRACE_GET_SYSCALL_INFO #2"); + } + break; + + default: + LOG_KILL_TRACEE("unexpected stop signal %u", + WSTOPSIG(status)); + abort(); + } + + ASSERT_EQ(0, sys_ptrace(PTRACE_SYSCALL, tracee_pid, 0, 0)) { + LOG_KILL_TRACEE("PTRACE_SYSCALL: %m"); + } + } + + ASSERT_EQ(ptrace_stop, ARRAY_SIZE(si) * 2); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index b2f76a52215ad..d566e40a6028f 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -73,6 +73,14 @@ #define noinline __attribute__((noinline)) #endif +#ifndef __nocf_check +#define __nocf_check __attribute__((nocf_check)) +#endif + +#ifndef __naked +#define __naked __attribute__((__naked__)) +#endif + #ifndef PR_SET_NO_NEW_PRIVS #define PR_SET_NO_NEW_PRIVS 38 #define PR_GET_NO_NEW_PRIVS 39 @@ -4899,7 +4907,36 @@ TEST(tsync_vs_dead_thread_leader) EXPECT_EQ(0, status); } -noinline int probed(void) +#ifdef __x86_64__ + +/* + * We need naked probed_uprobe function. Using __nocf_check + * check to skip possible endbr64 instruction and ignoring + * -Wattributes, otherwise the compilation might fail. + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wattributes" + +__naked __nocf_check noinline int probed_uprobe(void) +{ + /* + * Optimized uprobe is possible only on top of nop5 instruction. + */ + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); +} +#pragma GCC diagnostic pop + +#else +noinline int probed_uprobe(void) +{ + return 1; +} +#endif + +noinline int probed_uretprobe(void) { return 1; } @@ -4952,35 +4989,46 @@ static ssize_t get_uprobe_offset(const void *addr) return found ? (uintptr_t)addr - start + base : -1; } -FIXTURE(URETPROBE) { +FIXTURE(UPROBE) { int fd; }; -FIXTURE_VARIANT(URETPROBE) { +FIXTURE_VARIANT(UPROBE) { /* - * All of the URETPROBE behaviors can be tested with either - * uretprobe attached or not + * All of the U(RET)PROBE behaviors can be tested with either + * u(ret)probe attached or not */ bool attach; + /* + * Test both uprobe and uretprobe. + */ + bool uretprobe; }; -FIXTURE_VARIANT_ADD(URETPROBE, attached) { +FIXTURE_VARIANT_ADD(UPROBE, not_attached) { + .attach = false, + .uretprobe = false, +}; + +FIXTURE_VARIANT_ADD(UPROBE, uprobe_attached) { .attach = true, + .uretprobe = false, }; -FIXTURE_VARIANT_ADD(URETPROBE, not_attached) { - .attach = false, +FIXTURE_VARIANT_ADD(UPROBE, uretprobe_attached) { + .attach = true, + .uretprobe = true, }; -FIXTURE_SETUP(URETPROBE) +FIXTURE_SETUP(UPROBE) { const size_t attr_sz = sizeof(struct perf_event_attr); struct perf_event_attr attr; ssize_t offset; int type, bit; -#ifndef __NR_uretprobe - SKIP(return, "__NR_uretprobe syscall not defined"); +#if !defined(__NR_uprobe) || !defined(__NR_uretprobe) + SKIP(return, "__NR_uprobe ot __NR_uretprobe syscalls not defined"); #endif if (!variant->attach) @@ -4990,12 +5038,17 @@ FIXTURE_SETUP(URETPROBE) type = determine_uprobe_perf_type(); ASSERT_GE(type, 0); - bit = determine_uprobe_retprobe_bit(); - ASSERT_GE(bit, 0); - offset = get_uprobe_offset(probed); + + if (variant->uretprobe) { + bit = determine_uprobe_retprobe_bit(); + ASSERT_GE(bit, 0); + } + + offset = get_uprobe_offset(variant->uretprobe ? probed_uretprobe : probed_uprobe); ASSERT_GE(offset, 0); - attr.config |= 1 << bit; + if (variant->uretprobe) + attr.config |= 1 << bit; attr.size = attr_sz; attr.type = type; attr.config1 = ptr_to_u64("/proc/self/exe"); @@ -5006,7 +5059,7 @@ FIXTURE_SETUP(URETPROBE) PERF_FLAG_FD_CLOEXEC); } -FIXTURE_TEARDOWN(URETPROBE) +FIXTURE_TEARDOWN(UPROBE) { /* we could call close(self->fd), but we'd need extra filter for * that and since we are calling _exit right away.. @@ -5020,11 +5073,17 @@ static int run_probed_with_filter(struct sock_fprog *prog) return -1; } - probed(); + /* + * Uprobe is optimized after first hit, so let's hit twice. + */ + probed_uprobe(); + probed_uprobe(); + + probed_uretprobe(); return 0; } -TEST_F(URETPROBE, uretprobe_default_allow) +TEST_F(UPROBE, uprobe_default_allow) { struct sock_filter filter[] = { BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), @@ -5037,7 +5096,7 @@ TEST_F(URETPROBE, uretprobe_default_allow) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_default_block) +TEST_F(UPROBE, uprobe_default_block) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, @@ -5054,11 +5113,14 @@ TEST_F(URETPROBE, uretprobe_default_block) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) +TEST_F(UPROBE, uprobe_block_syscall) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), +#ifdef __NR_uprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 1, 2), +#endif #ifdef __NR_uretprobe BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1), #endif @@ -5073,11 +5135,14 @@ TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall) +TEST_F(UPROBE, uprobe_default_block_with_syscall) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), +#ifdef __NR_uprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 3, 0), +#endif #ifdef __NR_uretprobe BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0), #endif diff --git a/tools/testing/shared/linux.c b/tools/testing/shared/linux.c index 66dbb362385f3..0f97fb0d19e19 100644 --- a/tools/testing/shared/linux.c +++ b/tools/testing/shared/linux.c @@ -150,7 +150,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) void kmem_cache_free_bulk(struct kmem_cache *cachep, size_t size, void **list) { if (kmalloc_verbose) - pr_debug("Bulk free %p[0-%lu]\n", list, size - 1); + pr_debug("Bulk free %p[0-%zu]\n", list, size - 1); pthread_mutex_lock(&cachep->lock); for (int i = 0; i < size; i++) @@ -168,7 +168,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, size_t i; if (kmalloc_verbose) - pr_debug("Bulk alloc %lu\n", size); + pr_debug("Bulk alloc %zu\n", size); pthread_mutex_lock(&cachep->lock); if (cachep->nr_objs >= size) { diff --git a/tools/testing/shared/linux/cleanup.h b/tools/testing/shared/linux/cleanup.h new file mode 100644 index 0000000000000..ea3081426ee95 --- /dev/null +++ b/tools/testing/shared/linux/cleanup.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include "../../../../include/linux/cleanup.h" diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 11f761769b5b7..7cfd6e31db10d 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -185,6 +185,15 @@ static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start, vmg->__adjust_next_start = false; } +/* Helper function to set both the VMG range and its anon_vma. */ +static void vmg_set_range_anon_vma(struct vma_merge_struct *vmg, unsigned long start, + unsigned long end, pgoff_t pgoff, vm_flags_t flags, + struct anon_vma *anon_vma) +{ + vmg_set_range(vmg, start, end, pgoff, flags); + vmg->anon_vma = anon_vma; +} + /* * Helper function to try to merge a new VMA. * @@ -265,6 +274,22 @@ static void dummy_close(struct vm_area_struct *) { } +static void __vma_set_dummy_anon_vma(struct vm_area_struct *vma, + struct anon_vma_chain *avc, + struct anon_vma *anon_vma) +{ + vma->anon_vma = anon_vma; + INIT_LIST_HEAD(&vma->anon_vma_chain); + list_add(&avc->same_vma, &vma->anon_vma_chain); + avc->anon_vma = vma->anon_vma; +} + +static void vma_set_dummy_anon_vma(struct vm_area_struct *vma, + struct anon_vma_chain *avc) +{ + __vma_set_dummy_anon_vma(vma, avc, &dummy_anon_vma); +} + static bool test_simple_merge(void) { struct vm_area_struct *vma; @@ -953,6 +978,7 @@ static bool test_merge_existing(void) const struct vm_operations_struct vm_ops = { .close = dummy_close, }; + struct anon_vma_chain avc = {}; /* * Merge right case - partial span. @@ -968,10 +994,10 @@ static bool test_merge_existing(void) vma->vm_ops = &vm_ops; /* This should have no impact. */ vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); vma_next->vm_ops = &vm_ops; /* This should have no impact. */ - vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, flags, &dummy_anon_vma); vmg.middle = vma; vmg.prev = vma; - vma->anon_vma = &dummy_anon_vma; + vma_set_dummy_anon_vma(vma, &avc); ASSERT_EQ(merge_existing(&vmg), vma_next); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_next->vm_start, 0x3000); @@ -1001,9 +1027,9 @@ static bool test_merge_existing(void) vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags); vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); vma_next->vm_ops = &vm_ops; /* This should have no impact. */ - vmg_set_range(&vmg, 0x2000, 0x6000, 2, flags); + vmg_set_range_anon_vma(&vmg, 0x2000, 0x6000, 2, flags, &dummy_anon_vma); vmg.middle = vma; - vma->anon_vma = &dummy_anon_vma; + vma_set_dummy_anon_vma(vma, &avc); ASSERT_EQ(merge_existing(&vmg), vma_next); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_next->vm_start, 0x2000); @@ -1030,11 +1056,10 @@ static bool test_merge_existing(void) vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); vma->vm_ops = &vm_ops; /* This should have no impact. */ - vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; - vma->anon_vma = &dummy_anon_vma; - + vma_set_dummy_anon_vma(vma, &avc); ASSERT_EQ(merge_existing(&vmg), vma_prev); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); @@ -1064,10 +1089,10 @@ static bool test_merge_existing(void) vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); - vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; - vma->anon_vma = &dummy_anon_vma; + vma_set_dummy_anon_vma(vma, &avc); ASSERT_EQ(merge_existing(&vmg), vma_prev); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); @@ -1094,10 +1119,10 @@ static bool test_merge_existing(void) vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); - vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; - vma->anon_vma = &dummy_anon_vma; + vma_set_dummy_anon_vma(vma, &avc); ASSERT_EQ(merge_existing(&vmg), vma_prev); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); @@ -1180,12 +1205,9 @@ static bool test_anon_vma_non_mergeable(void) .mm = &mm, .vmi = &vmi, }; - struct anon_vma_chain dummy_anon_vma_chain1 = { - .anon_vma = &dummy_anon_vma, - }; - struct anon_vma_chain dummy_anon_vma_chain2 = { - .anon_vma = &dummy_anon_vma, - }; + struct anon_vma_chain dummy_anon_vma_chain_1 = {}; + struct anon_vma_chain dummy_anon_vma_chain_2 = {}; + struct anon_vma dummy_anon_vma_2; /* * In the case of modified VMA merge, merging both left and right VMAs @@ -1209,24 +1231,11 @@ static bool test_anon_vma_non_mergeable(void) * * However, when prev is compared to next, the merge should fail. */ - - INIT_LIST_HEAD(&vma_prev->anon_vma_chain); - list_add(&dummy_anon_vma_chain1.same_vma, &vma_prev->anon_vma_chain); - ASSERT_TRUE(list_is_singular(&vma_prev->anon_vma_chain)); - vma_prev->anon_vma = &dummy_anon_vma; - ASSERT_TRUE(is_mergeable_anon_vma(NULL, vma_prev->anon_vma, vma_prev)); - - INIT_LIST_HEAD(&vma_next->anon_vma_chain); - list_add(&dummy_anon_vma_chain2.same_vma, &vma_next->anon_vma_chain); - ASSERT_TRUE(list_is_singular(&vma_next->anon_vma_chain)); - vma_next->anon_vma = (struct anon_vma *)2; - ASSERT_TRUE(is_mergeable_anon_vma(NULL, vma_next->anon_vma, vma_next)); - - ASSERT_FALSE(is_mergeable_anon_vma(vma_prev->anon_vma, vma_next->anon_vma, NULL)); - - vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, flags, NULL); vmg.prev = vma_prev; vmg.middle = vma; + vma_set_dummy_anon_vma(vma_prev, &dummy_anon_vma_chain_1); + __vma_set_dummy_anon_vma(vma_next, &dummy_anon_vma_chain_2, &dummy_anon_vma_2); ASSERT_EQ(merge_existing(&vmg), vma_prev); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); @@ -1253,17 +1262,12 @@ static bool test_anon_vma_non_mergeable(void) vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); - INIT_LIST_HEAD(&vma_prev->anon_vma_chain); - list_add(&dummy_anon_vma_chain1.same_vma, &vma_prev->anon_vma_chain); - vma_prev->anon_vma = (struct anon_vma *)1; - - INIT_LIST_HEAD(&vma_next->anon_vma_chain); - list_add(&dummy_anon_vma_chain2.same_vma, &vma_next->anon_vma_chain); - vma_next->anon_vma = (struct anon_vma *)2; - - vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, flags, NULL); vmg.prev = vma_prev; + vma_set_dummy_anon_vma(vma_prev, &dummy_anon_vma_chain_1); + __vma_set_dummy_anon_vma(vma_next, &dummy_anon_vma_chain_2, &dummy_anon_vma_2); + vmg.anon_vma = NULL; ASSERT_EQ(merge_new(&vmg), vma_prev); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); @@ -1363,8 +1367,8 @@ static bool test_dup_anon_vma(void) vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags); - - vma->anon_vma = &dummy_anon_vma; + vmg.anon_vma = &dummy_anon_vma; + vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain); vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); vmg.prev = vma_prev; vmg.middle = vma; @@ -1392,7 +1396,7 @@ static bool test_dup_anon_vma(void) vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, flags); - vma->anon_vma = &dummy_anon_vma; + vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain); vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); vmg.prev = vma_prev; vmg.middle = vma; @@ -1420,7 +1424,7 @@ static bool test_dup_anon_vma(void) vma = alloc_and_link_vma(&mm, 0, 0x5000, 0, flags); vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags); - vma->anon_vma = &dummy_anon_vma; + vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain); vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); vmg.prev = vma; vmg.middle = vma; @@ -1447,6 +1451,7 @@ static bool test_vmi_prealloc_fail(void) .mm = &mm, .vmi = &vmi, }; + struct anon_vma_chain avc = {}; struct vm_area_struct *vma_prev, *vma; /* @@ -1459,9 +1464,10 @@ static bool test_vmi_prealloc_fail(void) vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); vma->anon_vma = &dummy_anon_vma; - vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x5000, 3, flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; + vma_set_dummy_anon_vma(vma, &avc); fail_prealloc = true;