FakeMeetingsDetector/url_analyzer.py at main · Macciox/FakeMeetingsDetector · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import re
import validators
from urllib.parse import urlparse
from domain_checker import DomainChecker
from api_clients import SecurityAPIClient
from config import SUSPICIOUS_KEYWORDS

class URLAnalyzer:
    def __init__(self):
        self.domain_checker = DomainChecker()
        self.security_client = SecurityAPIClient()

    def extract_urls(self, text):
        """Extract URLs from text"""
        url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        urls = re.findall(url_pattern, text)

        # Also check for URLs without protocol
        domain_pattern = r'(?:www\.)?[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*\.[a-zA-Z]{2,}'
        potential_urls = re.findall(domain_pattern, text)

        # Add http:// to potential URLs and validate
        for potential_url in potential_urls:
            full_url = f"http://{potential_url}"
            if validators.url(full_url) and full_url not in urls:
                urls.append(full_url)

        return urls

    def analyze_url(self, url):
        """Comprehensive URL analysis"""
        if not validators.url(url):
            return {'error': 'Invalid URL format'}

        try:
            # Parse URL
            parsed = urlparse(url)

            # Initialize analysis results
            analysis = {
                'url': url,
                'domain': parsed.netloc.lower(),
                'path': parsed.path,
                'query': parsed.query,
                'security_level': 'SAFE',  # SAFE, SUSPICIOUS, DANGEROUS
                'confidence': 0,
                'issues': [],
                'recommendations': []
            }

            # Domain analysis
            domain_results = self.domain_checker.check_domain(url)
            if 'error' not in domain_results:
                analysis['domain_analysis'] = domain_results
                analysis['issues'].extend(domain_results['issues'])

            # Security API checks
            security_results = self.security_client.comprehensive_check(url)
            analysis['security_analysis'] = security_results

            # URL structure analysis
            structure_issues = self._analyze_url_structure(url)
            analysis['issues'].extend(structure_issues)

            # Content analysis
            content_issues = self._analyze_url_content(url)
            analysis['issues'].extend(content_issues)

            # Calculate final security assessment
            analysis = self._calculate_final_assessment(analysis)

            return analysis

        except Exception as e:
            return {'error': str(e)}

    def _analyze_url_structure(self, url):
        """Analyze URL structure for suspicious patterns"""
        issues = []
        parsed = urlparse(url)

        # Check for excessive subdomains
        domain_parts = parsed.netloc.split('.')
        if len(domain_parts) > 4:
            issues.append("Excessive subdomains detected")

        # Check for suspicious path patterns
        if '/redirect' in parsed.path or '/r/' in parsed.path:
            issues.append("Contains redirect patterns")

        # Check for URL shorteners (basic detection)
        shortener_domains = ['bit.ly', 'tinyurl.com', 't.co', 'goo.gl', 'ow.ly']
        if any(shortener in parsed.netloc for shortener in shortener_domains):
            issues.append("Uses URL shortener service")

        # Check for suspicious query parameters
        if 'token' in parsed.query or 'auth' in parsed.query:
            issues.append("Contains authentication tokens in URL")

        return issues

    def _analyze_url_content(self, url):
        """Analyze URL content for suspicious keywords"""
        issues = []
        url_lower = url.lower()

        found_keywords = [keyword for keyword in SUSPICIOUS_KEYWORDS if keyword in url_lower]
        if found_keywords:
            issues.append(f"Contains suspicious keywords: {', '.join(found_keywords)}")

        return issues

    def _calculate_final_assessment(self, analysis):
        """Calculate final security assessment"""
        risk_score = 0

        # Domain analysis scoring
        domain_analysis = analysis.get('domain_analysis', {})
        if not domain_analysis.get('is_legitimate', True):
            risk_score += 40

        if domain_analysis.get('typosquatting_score', 0) > 0:
            risk_score += min(domain_analysis['typosquatting_score'], 30)

        if domain_analysis.get('domain_age_days') is not None:
            if domain_analysis['domain_age_days'] < 7:
                risk_score += 30
            elif domain_analysis['domain_age_days'] < 30:
                risk_score += 15

        if not domain_analysis.get('ssl_valid', True):
            risk_score += 20

        if domain_analysis.get('suspicious_tld', False):
            risk_score += 15

        # Security API scoring
        security_analysis = analysis.get('security_analysis', {})
        security_score = security_analysis.get('security_score', 100)
        risk_score += (100 - security_score)

        # Structure and content issues
        risk_score += len(analysis['issues']) * 5

        # Determine security level
        if risk_score >= 70:
            analysis['security_level'] = 'DANGEROUS'
            analysis['confidence'] = min(95, 70 + (risk_score - 70) * 0.5)
        elif risk_score >= 30:
            analysis['security_level'] = 'SUSPICIOUS'
            analysis['confidence'] = min(85, 50 + (risk_score - 30) * 0.875)
        else:
            analysis['security_level'] = 'SAFE'
            analysis['confidence'] = max(60, 100 - risk_score * 2)

        # Add recommendations
        analysis['recommendations'] = self._generate_recommendations(analysis)

        return analysis

    def _generate_recommendations(self, analysis):
        """Generate security recommendations"""
        recommendations = []

        if analysis['security_level'] == 'DANGEROUS':
            recommendations.append("🚨 DO NOT CLICK this link")
            recommendations.append("Report this link as phishing")
            recommendations.append("Delete the message containing this link")

        elif analysis['security_level'] == 'SUSPICIOUS':
            recommendations.append("⚠️ Exercise extreme caution")
            recommendations.append("Verify the sender's identity")
            recommendations.append("Check the legitimate website directly")

        else:
            recommendations.append("✅ Link appears safe")
            recommendations.append("Always verify meeting invitations through official channels")

        # Add specific recommendations based on domain type
        domain_analysis = analysis.get('domain_analysis', {})
        if not domain_analysis.get('is_legitimate', True):
            legitimate_examples = self._get_legitimate_examples(analysis['domain'])
            if legitimate_examples:
                recommendations.append(f"Legitimate links look like: {legitimate_examples}")

        return recommendations

    def _get_legitimate_examples(self, suspicious_domain):
        """Get examples of legitimate domains"""
        if 'meet' in suspicious_domain or 'google' in suspicious_domain:
            return "https://meet.google.com/abc-defg-hij"
        elif 'zoom' in suspicious_domain:
            return "https://zoom.us/j/1234567890"
        elif 'teams' in suspicious_domain or 'microsoft' in suspicious_domain:
            return "https://teams.microsoft.com/l/meetup-join/..."

        return None