diff --git a/CO_AUTHORS.md b/CO_AUTHORS.md index d4496f3ba..42e43ea8a 100644 --- a/CO_AUTHORS.md +++ b/CO_AUTHORS.md @@ -157,6 +157,17 @@ If it does, the backend will process the co-authors as follows, assume trailer v - Third we lookup for email using GitHub API. If the user is found, we use that user as co-author. -- Finally we use the name part for `name <email>` and lookup using GitHub API assuming that this name is GitHub username/login (this is the case for some bots). If the user is found, we use that user as co-author. +- Finally we use the name part for `name <email>`. If the name matches the GitHub username pattern (alphanumeric characters or hyphens, must be 3–39 characters long, cannot start or end with a hyphen, and cannot contain consecutive hyphens), then we lookup using the GitHub API assuming that this name is a GitHub username/login (this is the case for some bots). If the user is found, we use that user as co-author. We use internal caching while doing all those lookups with cache key `name` and `email` and TTL 24 hours. We even cache by `(name, email)` when nothing is found because this is the most time consuming option. It will have a chance to be found in the future (up to 24 hours from lookup). + + +# How to fix missing commit author message + +Make sure that co-authors use one of the following formats in their commit message: + +- `Co-authored-by: Any name <ID+username@users.noreply.github.com>` - exact GitHub user will be found by unique `ID` part. +- `Co-authored-by: Any name <username@users.noreply.github.com>` - exact GitHub user will be found by unique `username` part. +- `Co-authored-by: Any name <public-email>` - GitHub user will be found by `public-email` part - that must be made public on GitHub. +- `Co-authored-by: github-login <email>` - GitHub user will be found by `github-login` part (must be at least 3 characters long). 
+ diff --git a/cla-backend-go/github/github_repository.go b/cla-backend-go/github/github_repository.go index 27a5a3603..be52a66b3 100644 --- a/cla-backend-go/github/github_repository.go +++ b/cla-backend-go/github/github_repository.go @@ -27,6 +27,7 @@ var ( ErrGitHubRepositoryNotFound = errors.New("github repository not found") NoreplyIDPattern = regexp.MustCompile(`^(\d+)\+([a-zA-Z0-9-]+)@users\.noreply\.github\.com$`) NoreplyUserPattern = regexp.MustCompile(`^([a-zA-Z0-9-]+)@users\.noreply\.github\.com$`) + GithubUsernameRegex = regexp.MustCompile(`^[A-Za-z0-9-]{3,39}$`) ) const ( @@ -359,6 +360,20 @@ func ExpandWithCoAuthors( } } +// IsValidGitHubUsername checks if the provided username is a valid GitHub username. +func IsValidGitHubUsername(username string) bool { + if !GithubUsernameRegex.MatchString(username) { + return false + } + if strings.HasPrefix(username, "-") || strings.HasSuffix(username, "-") { + return false + } + if strings.Contains(username, "--") { + return false + } + return true +} + func GetCoAuthorCommits( ctx context.Context, client *github.Client, @@ -449,7 +464,7 @@ func GetCoAuthorCommits( } // 4. 
Last resort - try to find by name=login - if user == nil { + if user == nil && IsValidGitHubUsername(name) { // Note that Co-authored-by: name is not actually a GitHub login but rather a name - but we are trying hard to find a GitHub profile user, err = GetGithubUserByLogin(ctx, client, name) if err != nil { diff --git a/cla-backend/cla/models/github_models.py b/cla-backend/cla/models/github_models.py index 53301a7f8..ab1496215 100644 --- a/cla-backend/cla/models/github_models.py +++ b/cla-backend/cla/models/github_models.py @@ -36,6 +36,9 @@ EXCLUDE_GITHUB_EMAILS = ["noreply.github.com"] NOREPLY_ID_PATTERN = re.compile(r"^(\d+)\+([a-zA-Z0-9-]+)@users\.noreply\.github\.com$") NOREPLY_USER_PATTERN = re.compile(r"^([a-zA-Z0-9-]+)@users\.noreply\.github\.com$") +# GitHub usernames must be 3-39 characters long, can only contain alphanumeric characters or hyphens, +# cannot begin or end with a hyphen, and cannot contain consecutive hyphens. +GITHUB_USERNAME_REGEX = re.compile(r'^(?!-)(?!.*--)[A-Za-z0-9-]{3,39}(? bool: + return bool(GITHUB_USERNAME_REGEX.match(username)) def get_co_author_commits(co_author, commit, pr, installation_id): fn = "cla.models.github_models.get_co_author_commits" @@ -1965,7 +1970,7 @@ def get_co_author_commits(co_author, commit, pr, installation_id): user = None # 4. Last resort: try to find by name (login) - if user is None: + if user is None and is_valid_github_username(name): try: # Note that Co-authored-by: name is not actually a GitHub login but rather a name - but we are trying hard to find a GitHub profile cla.log.debug(f"{fn} - Lookup via login=name: {name}") diff --git a/tests/functional/README.md b/tests/functional/README.md index dc2ffa8f7..d78e6b499 100644 --- a/tests/functional/README.md +++ b/tests/functional/README.md @@ -88,7 +88,7 @@ CYPRESS_ENV=dev You can ask for example `.env` file over slack. - Run `npx cypress install` -- Run tests using cmd `npx cypress run`. +- Run tests using cmd `npx cypress run`. 
Or `xvfb-run -a npx cypress run` when running over SSH. - Run tests using UI `npx cypress open`. Choose **E2E testing**, select **Chrome** browser. - View test reports in the `cypress-report` directory. - Explore source code files for detailed implementation. diff --git a/utils/calculate_api_stats.sh b/utils/calculate_api_stats.sh new file mode 100755 index 000000000..af4143c3e --- /dev/null +++ b/utils/calculate_api_stats.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Copyright The Linux Foundation and each contributor to CommunityBridge. +# SPDX-License-Identifier: MIT + + +if [ -z "$STAGE" ] +then + export STAGE=prod +fi +if [ -z "$1" ] +then + echo "$0: please provide time range from value as a 1st argument, for example '2 hours ago'" + exit 1 +fi +export DTFROM="${1}" +REGION=us-east-1 NO_ECHO=1 DTTO='1 second ago' OUT="api-logs-${STAGE}-1.json" ./utils/search_aws_logs.sh 'LG:api-request-path' +REGION=us-east-2 NO_ECHO=1 DTTO='1 second ago' OUT="api-logs-${STAGE}-2.json" ./utils/search_aws_logs.sh 'LG:api-request-path' +jq -s 'add' "api-logs-${STAGE}-1.json" "api-logs-${STAGE}-2.json" > "api-logs-${STAGE}.json" && rm -f "api-logs-${STAGE}-1.json" "api-logs-${STAGE}-2.json" +./utils/count_apis.sh "api-logs-${STAGE}.json" > "api-logs-${STAGE}.log" && cat "api-logs-${STAGE}.log" diff --git a/utils/count_apis.sh b/utils/count_apis.sh new file mode 100755 index 000000000..2ecf21a3a --- /dev/null +++ b/utils/count_apis.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright The Linux Foundation and each contributor to CommunityBridge. +# SPDX-License-Identifier: MIT + +if [ -z "${1}" ] +then + echo "Usage: $0 " + echo "Example: $0 api-logs-prod.json" + exit 1 +fi + +jq -r ' + .[].message + | capture("LG:api-request-path:(?<p>[^\"[:space:]]+)")? # find the path + | select(.) # drop non-matches + | .p +' "${1}" \ +| sed -E 's/[0-9a-fA-F-]{36}//g' \ +| sed -E ':a;s#/([0-9]{1,})(/|$)#/\2#g;ta' \ +| sed -E 's#/(00|a0)[A-Za-z0-9]{13,16}(/|$)#/\2#g' \ +| sort | uniq -c | sort -nr diff --git a/utils/search_aws_log_group.sh b/utils/search_aws_log_group.sh index 492ce8c24..135eef6eb 100755 --- a/utils/search_aws_log_group.sh +++ b/utils/search_aws_log_group.sh @@ -1,5 +1,9 @@ #!/bin/bash # STAGE=dev DEBUG=1 DTFROM='3 days ago' DTTO='2 days ago' ./utils/search_aws_log_group.sh 'cla-backend-dev-githubactivity' 'error' +# REGION=us-east-2 STAGE=prod DEBUG=1 DTFROM='15 minutes ago' DTTO='1 second ago' ./utils/search_aws_log_group.sh 'cla-backend-go-api-v4-lambda' 'LG:api-request-path' +# REGION=us-east-1 STAGE=prod DEBUG=1 DTFROM='15 minutes ago' DTTO='1 second ago' ./utils/search_aws_log_group.sh 'cla-backend-prod-api-v3-lambda' 'LG:api-request-path' +# REGION=us-east-1 STAGE=prod DEBUG=1 DTFROM='15 minutes ago' DTTO='1 second ago' ./utils/search_aws_log_group.sh 'cla-backend-prod-apiv2' 'LG:api-request-path' +# REGION=us-east-1 STAGE=prod DEBUG=1 DTFROM='15 minutes ago' DTTO='1 second ago' ./utils/search_aws_log_group.sh 'cla-backend-prod-githubactivity' 'LG:api-request-path' if [ -z "$STAGE" ] then diff --git a/utils/search_aws_logs.sh b/utils/search_aws_logs.sh index 9795377c5..a3f16cce1 100755 --- a/utils/search_aws_logs.sh +++ b/utils/search_aws_logs.sh @@ -4,8 +4,8 @@ # SPDX-License-Identifier: MIT # REGION=us-east-1|us-east-2 STAGE=dev DEBUG=1 DTFROM='3 days ago' DTTO='2 days ago' OUT=logs.json ./utils/search_aws_logs.sh 'error' -# DEBUG=1 STAGE=dev REGION=us-east-1 DTFROM='10 days ago' DTTO='1 second ago' OUT=api-logs-dev.json ./utils/search_aws_logs.sh 'LG:api-request-path' && jq -r '.[].message' api-logs-dev.json | grep -o 'LG:api-request-path:[^[:space:]]*' | sed 's/^LG:api-request-path://' | sed -E 's/[0-9a-fA-F-]{36}//g' | sed -E 's/\b[0-9]{2,}\b//g' | sort | uniq -c | sort -nr -# 
DEBUG=1 STAGE=prod REGION=us-east-1 NO_ECHO=1 DTFROM='10 days ago' DTTO='1 second ago' OUT=api-logs-prod.json ./utils/search_aws_logs.sh 'LG:api-request-path' && jq -r '.[].message' api-logs-prod.json | grep -o 'LG:api-request-path:[^[:space:]]*' | sed 's/^LG:api-request-path://' | sed -E 's/[0-9a-fA-F-]{36}//g' | sed -E ':a;s#/([0-9]{1,})(/|$)#/\2#g;ta' | sort | uniq -c | sort -nr +# DEBUG=1 STAGE=dev REGION=us-east-1 DTFROM='10 days ago' DTTO='1 second ago' OUT=api-logs-dev.json ./utils/search_aws_logs.sh 'LG:api-request-path' && ./utils/count_apis.sh api-logs-dev.json +# DEBUG=1 STAGE=prod REGION=us-east-1 NO_ECHO=1 DTFROM='10 days ago' DTTO='1 second ago' OUT=api-logs-prod.json ./utils/search_aws_logs.sh 'LG:api-request-path' && ./utils/count_apis.sh api-logs-prod.json # To find distinct log groups: | jq -r 'map(.logGroupName) | unique | .[]' # in us-east-1 (mostly V1, V2 and V3): # To see specific log group: | jq 'map(select(.logGroupName == "/aws/lambda/cla-backend-dev-apiv1"))'