From 1687136e033f3f73494f228d83c80702933616a5 Mon Sep 17 00:00:00 2001 From: Matt <85322+mattmassicotte@users.noreply.github.com> Date: Wed, 1 May 2024 13:18:50 -0400 Subject: [PATCH] Some actual parsing --- Package.swift | 10 ++ Sources/Lowlight/Language.swift | 61 ++++++++-- Sources/Lowlight/Processor.swift | 138 ++++++++++++++++++++--- Tests/LowlightTests/LanguageTests.swift | 2 +- Tests/LowlightTests/ProcessorTests.swift | 92 +++++++++++++++ 5 files changed, 274 insertions(+), 29 deletions(-) create mode 100644 Tests/LowlightTests/ProcessorTests.swift diff --git a/Package.swift b/Package.swift index 1581d3c..14920f4 100644 --- a/Package.swift +++ b/Package.swift @@ -27,3 +27,13 @@ let package = Package( ), ] ) + +let swiftSettings: [SwiftSetting] = [ + .enableExperimentalFeature("StrictConcurrency") +] + +for target in package.targets { + var settings = target.swiftSettings ?? [] + settings.append(contentsOf: swiftSettings) + target.swiftSettings = settings +} diff --git a/Sources/Lowlight/Language.swift b/Sources/Lowlight/Language.swift index 70d35a0..5ac2933 100644 --- a/Sources/Lowlight/Language.swift +++ b/Sources/Lowlight/Language.swift @@ -1,20 +1,59 @@ import Foundation -/// The model of the Lowlight language. -public struct Language: Codable, Hashable, Sendable { - public let keywords: Set - public let symbols: Set +public struct Pattern: Hashable, Sendable { + public enum Behavior: Hashable, Sendable { + case exact(String) + case toEndOfLine(String) + + public func matches(_ character: Character, at offset: Int) -> Bool { + switch self { + case let .exact(string): + let patternIdx = string.index(string.startIndex, offsetBy: offset) - public init(keywords: Set = [], symbols: Set = []) { - self.keywords = keywords - self.symbols = symbols + return string[patternIdx] == character + case let .toEndOfLine(string): + let patternIdx = string.index(string.startIndex, offsetBy: offset) + + return string[patternIdx] == character + } + } } - var keywordPattern: String { - "(" + keywords.joined(separator: "|") + ")" + public enum Element: Hashable, Sendable { + case keyword + case comment + + public var treeSitterHighlightName: String { + switch self { + case .comment: + "comment" + case .keyword: + "keyword" + } + } + } + + public let element: Element + public let match: Behavior + + public init(element: Element, match: Behavior) { + self.element = element + self.match = match } +} + +/// The model of the Lowlight language. +public struct Language: Hashable, Sendable { + public let patterns: Set + + public init(patterns: Set) { + self.patterns = patterns + } + + public init(keywords: Set, lineComment: String) { + let keywordPatterns = keywords.map({ Pattern(element: .keyword, match: .exact($0)) }) + let patterns = keywordPatterns + [.init(element: .comment, match: .toEndOfLine(lineComment))] - var symbolsPattern: String { - "(" + symbols.joined(separator: "|") + ")" + self.init(patterns: Set(patterns)) } } diff --git a/Sources/Lowlight/Processor.swift b/Sources/Lowlight/Processor.swift index 3f51e8e..ce63419 100644 --- a/Sources/Lowlight/Processor.swift +++ b/Sources/Lowlight/Processor.swift @@ -2,8 +2,20 @@ import Foundation /// Represents a range in the input with an associated identifier. public struct Token: Hashable, Sendable { - public let name: String + public let element: Pattern.Element public let range: NSRange + + public init(_ element: Pattern.Element, range: NSRange) { + self.element = element + self.range = range + } + + init(_ pattern: Pattern, input: String, start: String.Index, offset: Int) { + let end = input.index(start, offsetBy: offset) + + self.range = NSRange(start.. Bool { + character.unicodeScalars.allSatisfy { contains($0) } + } +} + /// Type that uses a language model to produce tokens and scopes. public struct Processor { public struct Output: Hashable, Sendable { @@ -18,6 +36,23 @@ public struct Processor { public let scopes: [Scope] } + private enum State { + case scanning + case matching(patterns: Set, start: String.Index, length: Int) + case matched(Pattern, start: String.Index, length: Int) + case advancingUntil(CharacterSet, pattern: Pattern, start: String.Index) + case skipping + case advanceToBoundary + + mutating func advance() { + guard case let .matching(patterns: patterns, start: start, length: length) = self else { + fatalError() + } + + self = .matching(patterns: patterns, start: start, length: length + 1) + } + } + public let language: Language public init(language: Language) { @@ -32,28 +67,97 @@ public struct Processor { } public func processTokens(for input: String) -> [Token] { + var state = State.scanning var tokens = [Token]() - if let pattern = try? NSRegularExpression(pattern: language.keywordPattern), language.keywords.isEmpty == false { - let matches = pattern.matches(in: input, range: NSRange(0.. 0 else { continue } - - tokens.append(Token(name: "keyword", range: range)) + var index = input.startIndex + let patterns = language.patterns + let anchors = CharacterSet.whitespacesAndNewlines + + while index < input.endIndex { + let char = input[index] + + switch state { + case .scanning: + let possible = patterns.filter { $0.match.matches(char, at: 0) } + + if possible.isEmpty { + state = .skipping + continue + } + + state = .matching(patterns: possible, start: index, length: 1) + case .skipping: + if anchors.contains(char) { + state = .advanceToBoundary + } + case .advanceToBoundary: + if anchors.contains(char) == false { + state = .scanning + continue + } + case let .matching(patterns: active, start: start, length: length): + let newLength = length + 1 + + let remaining = active.filter { $0.match.matches(char, at: length) } + + // no matches, continue + guard remaining.isEmpty == false else { + state = .scanning + continue + } + + // more than one match, keep going + guard let pattern = remaining.first, remaining.count == 1 else { + state = .matching(patterns: remaining, start: start, length: newLength) + break + } + + switch pattern.match { + case let .exact(value): + if newLength < value.count { + state = .matching(patterns: remaining, start: start, length: newLength) + break + } + + state = .matched(pattern, start: start, length: newLength) + case .toEndOfLine: + state = .advancingUntil(.newlines, pattern: pattern, start: start) + } + case let .matched(pattern, start: start, length: length): + if anchors.contains(char) == false { + state = .skipping + break + } + + tokens.append(Token(pattern, input: input, start: start, offset: length)) + + state = .scanning + case let .advancingUntil(charSet, pattern: pattern, start: start): + guard charSet.contains(char) else { + break + } + + let range = NSRange(start.. 0 else { continue } + switch state { + case let .advancingUntil(_, pattern: pattern, start: start): + let range = NSRange(start..