Skip to content

Commit

Permalink
Some actual parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
mattmassicotte committed May 1, 2024
1 parent ec2687f commit 1687136
Show file tree
Hide file tree
Showing 5 changed files with 274 additions and 29 deletions.
10 changes: 10 additions & 0 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,13 @@ let package = Package(
),
]
)

// Compiler settings that every target in `package` should receive.
let swiftSettings: [SwiftSetting] = [
    .enableExperimentalFeature("StrictConcurrency"),
]

// Extend each target's own settings (if any) with the shared ones.
package.targets.forEach { target in
    target.swiftSettings = (target.swiftSettings ?? []) + swiftSettings
}
61 changes: 50 additions & 11 deletions Sources/Lowlight/Language.swift
Original file line number Diff line number Diff line change
@@ -1,20 +1,59 @@
import Foundation

/// The model of the Lowlight language.
public struct Language: Codable, Hashable, Sendable {
public let keywords: Set<String>
public let symbols: Set<String>
/// A single recognition rule: the kind of element it produces and how its
/// text is compared against the input.
public struct Pattern: Hashable, Sendable {
    /// How a pattern's text is matched against input characters.
    public enum Behavior: Hashable, Sendable {
        /// The input has to spell out the associated string.
        case exact(String)
        /// The associated string is a marker; `Processor` extends the match
        /// from the marker up to the next newline.
        case toEndOfLine(String)

        /// Tests whether `character` equals the pattern's character at `offset`.
        ///
        /// - Parameters:
        ///   - character: The input character to compare.
        ///   - offset: Zero-based position, counted in `Character`s, within
        ///     the pattern string.
        /// - Returns: `true` when the pattern has a character at `offset` and
        ///   it equals `character`; `false` otherwise, including when
        ///   `offset` is negative or lies at/after the end of the pattern.
        public func matches(_ character: Character, at offset: Int) -> Bool {
            // Both cases compare against their associated string the same way.
            let string: String
            switch self {
            case let .exact(value), let .toEndOfLine(value):
                string = value
            }

            // An unchecked `index(_:offsetBy:)` followed by a subscript traps
            // once `offset` reaches the pattern's length. Callers probing one
            // character past a short pattern must get `false`, not a crash.
            guard offset >= 0,
                  let patternIdx = string.index(string.startIndex, offsetBy: offset, limitedBy: string.endIndex),
                  patternIdx < string.endIndex
            else {
                return false
            }

            return string[patternIdx] == character
        }
    }

    /// The kind of syntax element a pattern identifies.
    public enum Element: Hashable, Sendable {
        case keyword
        case comment

        /// The highlight name string associated with this element.
        public var treeSitterHighlightName: String {
            switch self {
            case .comment:
                "comment"
            case .keyword:
                "keyword"
            }
        }
    }

    /// The element produced when this pattern matches.
    public let element: Element
    /// How this pattern is compared against the input.
    public let match: Behavior

    /// Creates a pattern that reports `element` for text matched by `match`.
    public init(element: Element, match: Behavior) {
        self.element = element
        self.match = match
    }
}

/// The model of the Lowlight language.
///
/// A language is a set of `Pattern` values.
public struct Language: Hashable, Sendable {
/// The patterns that make up this language.
public let patterns: Set<Pattern>

/// Creates a language from an explicit set of patterns.
public init(patterns: Set<Pattern>) {
self.patterns = patterns
}

/// Creates a language with one `.exact` keyword pattern per entry in
/// `keywords`, plus one `.toEndOfLine` comment pattern for `lineComment`.
public init(keywords: Set<String>, lineComment: String) {
let keywordPatterns = keywords.map({ Pattern(element: .keyword, match: .exact($0)) })
let patterns = keywordPatterns + [.init(element: .comment, match: .toEndOfLine(lineComment))]

// NOTE(review): the next two lines reference `symbols`, which this struct
// does not declare, and open a declaration in the middle of this initializer.
// They look like leftovers from a removed earlier version — confirm and drop.
var symbolsPattern: String {
"(" + symbols.joined(separator: "|") + ")"
// Collapse the array into a set and delegate to `init(patterns:)`.
self.init(patterns: Set(patterns))
}
}
138 changes: 121 additions & 17 deletions Sources/Lowlight/Processor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,57 @@ import Foundation

/// Represents a range in the input with an associated identifier.
public struct Token: Hashable, Sendable {
// NOTE(review): neither initializer below assigns `name`; this looks like a
// leftover from a removed earlier version of the struct — confirm and drop.
public let name: String
/// The kind of element this token marks.
public let element: Pattern.Element
/// The location of the token, as an `NSRange`.
public let range: NSRange

/// Creates a token for `element` covering `range`.
public init(_ element: Pattern.Element, range: NSRange) {
self.element = element
self.range = range
}

/// Creates a token for `pattern.element` that begins at `start` in `input`
/// and spans `offset` characters.
init(_ pattern: Pattern, input: String, start: String.Index, offset: Int) {
// `offset` is counted in `Character`s from `start`.
let end = input.index(start, offsetBy: offset)

// Convert the String.Index range into an NSRange relative to `input`.
self.range = NSRange(start..<end, in: input)
self.element = pattern.element
}
}

/// Represents a textual scope.
///
/// The type currently declares no members.
public struct Scope: Hashable, Sendable {

}

extension CharacterSet {
    /// Reports whether every Unicode scalar that makes up `character` is a
    /// member of this set.
    func contains(_ character: Character) -> Bool {
        // Bail out on the first scalar that is not in the set.
        for scalar in character.unicodeScalars where !self.contains(scalar) {
            return false
        }

        return true
    }
}

/// Type that uses a language model to produce tokens and scopes.
public struct Processor {
/// The tokens and scopes produced for an input.
public struct Output: Hashable, Sendable {
/// The tokens found in the input.
public let tokens: [Token]
/// The scopes found in the input.
public let scopes: [Scope]
}

/// The states `processTokens(for:)` moves between while walking the input.
private enum State {
    /// Testing the current character against the first character of every pattern.
    case scanning
    /// `length` characters beginning at `start` agree with every pattern in `patterns`.
    case matching(patterns: Set<Pattern>, start: String.Index, length: Int)
    /// A pattern matched `length` characters beginning at `start`.
    case matched(Pattern, start: String.Index, length: Int)
    /// Consuming input for `pattern` from `start` until a character in the set appears.
    case advancingUntil(CharacterSet, pattern: Pattern, start: String.Index)
    /// Passing over characters until whitespace or a newline appears.
    case skipping
    /// Passing over whitespace and newlines until some other character appears.
    case advanceToBoundary

    /// Extends an in-progress `.matching` state by one character.
    ///
    /// Calling this in any other state is a programmer error and traps.
    mutating func advance() {
        switch self {
        case .matching(let candidates, let origin, let consumed):
            self = .matching(patterns: candidates, start: origin, length: consumed + 1)
        default:
            fatalError()
        }
    }
}

public let language: Language

public init(language: Language) {
Expand All @@ -32,28 +67,97 @@ public struct Processor {
}

/// Walks `input` one `Character` at a time through a small state machine and
/// returns the tokens recognized by `language.patterns`.
///
/// Inside the loop, `continue` re-dispatches the same character under the new
/// state (the index is not advanced); leaving the `switch` normally advances
/// to the next character.
public func processTokens(for input: String) -> [Token] {
var state = State.scanning
var tokens = [Token]()

// NOTE(review): the NSRegularExpression-based lines in this function (here and
// again further down) use `language.keywordPattern`, `language.keywords`,
// `language.symbolsPattern`, `language.symbols` and `Token(name:range:)`.
// They appear to be leftovers from a removed regex-based implementation —
// confirm and drop.
if let pattern = try? NSRegularExpression(pattern: language.keywordPattern), language.keywords.isEmpty == false {
let matches = pattern.matches(in: input, range: NSRange(0..<input.utf16.count))

for match in matches {
let range = match.range(at: 0)
guard range.length > 0 else { continue }

tokens.append(Token(name: "keyword", range: range))
var index = input.startIndex
let patterns = language.patterns
// Whitespace and newlines delimit candidate matches.
let anchors = CharacterSet.whitespacesAndNewlines

while index < input.endIndex {
let char = input[index]

switch state {
case .scanning:
// Keep only the patterns whose first character is `char`.
let possible = patterns.filter { $0.match.matches(char, at: 0) }

if possible.isEmpty {
// Nothing can start here; let `.skipping` look at this same character.
state = .skipping
continue
}

// NOTE(review): a one-character pattern enters `.matching` with length 1, so
// the next iteration asks `matches` about offset 1 of that pattern — confirm
// `matches` tolerates an offset equal to the pattern's length.
state = .matching(patterns: possible, start: index, length: 1)
case .skipping:
// Pass over characters until whitespace or a newline is reached.
if anchors.contains(char) {
state = .advanceToBoundary
}
case .advanceToBoundary:
// Pass over whitespace; the first other character is re-dispatched to `.scanning`.
if anchors.contains(char) == false {
state = .scanning
continue
}
case let .matching(patterns: active, start: start, length: length):
let newLength = length + 1

// `length` characters are already consumed, so `char` is compared at offset `length`.
let remaining = active.filter { $0.match.matches(char, at: length) }

// no matches, continue
guard remaining.isEmpty == false else {
state = .scanning
continue
}

// more than one match, keep going
guard let pattern = remaining.first, remaining.count == 1 else {
state = .matching(patterns: remaining, start: start, length: newLength)
break
}

switch pattern.match {
case let .exact(value):
// A single candidate remains but is not fully consumed yet.
if newLength < value.count {
state = .matching(patterns: remaining, start: start, length: newLength)
break
}

state = .matched(pattern, start: start, length: newLength)
case .toEndOfLine:
// Marker recognized: consume everything up to the next newline.
state = .advancingUntil(.newlines, pattern: pattern, start: start)
}
case let .matched(pattern, start: start, length: length):
// A completed match only becomes a token when followed by whitespace or a newline.
if anchors.contains(char) == false {
state = .skipping
break
}

tokens.append(Token(pattern, input: input, start: start, offset: length))

state = .scanning
case let .advancingUntil(charSet, pattern: pattern, start: start):
// Keep consuming until a character from `charSet` shows up.
guard charSet.contains(char) else {
break
}

// The token covers `start` up to, but not including, the terminating character.
let range = NSRange(start..<index, in: input)

tokens.append(Token(pattern.element, range: range))

// Re-dispatch the terminating character under `.scanning`.
state = .scanning
continue
}
}

if let pattern = try? NSRegularExpression(pattern: language.symbolsPattern), language.symbols.isEmpty == false {
let matches = pattern.matches(in: input, range: NSRange(0..<input.utf16.count))
index = input.index(after: index)
}

for match in matches {
let range = match.range(at: 0)
guard range.length > 0 else { continue }
// End of input: emit a token for a match that was still open.
switch state {
case let .advancingUntil(_, pattern: pattern, start: start):
let range = NSRange(start..<index, in: input)

tokens.append(Token(name: "keyword.operator.text", range: range))
}
tokens.append(Token(pattern.element, range: range))
case let .matched(pattern, start: start, length: length):
tokens.append(Token(pattern, input: input, start: start, offset: length))
default:
break
}

return tokens
Expand Down
2 changes: 1 addition & 1 deletion Tests/LowlightTests/LanguageTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import Lowlight

final class LanguageTests: XCTestCase {
/// Smoke test: a `Language` can be constructed from an empty pattern set.
func testLanguageWithOneKeyword() throws {
// NOTE(review): `language` is declared twice; the zero-argument call on the
// next line looks like a leftover from a removed earlier version — confirm and drop.
let language = Language()
let language = Language(patterns: [])

// NOTE(review): `Language(patterns:)` is a non-failable initializer, so this
// assertion cannot fail — consider asserting on `patterns` instead.
XCTAssertNotNil(language)
}
Expand Down
92 changes: 92 additions & 0 deletions Tests/LowlightTests/ProcessorTests.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import XCTest
import Lowlight

/// Tests for the tokens `Processor.processTokens(for:)` produces.
///
/// Multi-line inputs are built by joining explicit lines with `"\n"` so that
/// leading whitespace, which the expected ranges depend on, is visible in the
/// source and cannot be lost to re-indentation.
final class ProcessorTests: XCTestCase {
    /// Line comments are found at the start of a line, after leading
    /// whitespace, after other text, and when they run to the end of input.
    func testLineComments() throws {
        let language = Language(patterns: [Pattern(element: .comment, match: .toEndOfLine("//"))])
        let processor = Processor(language: language)

        // The second line carries three leading whitespace characters: the
        // expected range 22..<41 requires its comment to start at offset 22,
        // and the line itself starts at offset 19.
        let input = [
            "// leading comment",
            "   // after whitespace",
            "abc def // trailing comment",
            "// comment inside // comment",
        ].joined(separator: "\n")

        let tokens = processor.processTokens(for: input)
        let expected = [
            Token(.comment, range: NSRange(0..<18)),
            Token(.comment, range: NSRange(22..<41)),
            Token(.comment, range: NSRange(50..<69)),
            Token(.comment, range: NSRange(70..<98)),
        ]

        XCTAssertEqual(tokens, expected)
    }

    /// An exact pattern followed by whitespace yields one token.
    func testSingleExactMatch() throws {
        let language = Language(patterns: [Pattern(element: .keyword, match: .exact("abc"))])
        let processor = Processor(language: language)

        let tokens = processor.processTokens(for: "abc ")

        let expected = [
            Token(.keyword, range: NSRange(0..<3)),
        ]

        XCTAssertEqual(tokens, expected)
    }

    /// An exact pattern that ends exactly at the end of input still yields a token.
    func testSingleExactMatchEOF() throws {
        let language = Language(patterns: [Pattern(element: .keyword, match: .exact("abc"))])
        let processor = Processor(language: language)

        let tokens = processor.processTokens(for: "abc")

        let expected = [
            Token(.keyword, range: NSRange(0..<3)),
        ]

        XCTAssertEqual(tokens, expected)
    }

    /// Exact patterns match only as whole, whitespace-delimited words.
    func testExactMatches() throws {
        let language = Language(patterns: [Pattern(element: .keyword, match: .exact("abc"))])
        let processor = Processor(language: language)

        // The third line carries one leading whitespace character: the
        // expected range 14..<17 requires "abc" to start at offset 14, and
        // the line itself starts at offset 13.
        let input = [
            "abc",
            "dabc abc",
            " abc",
            "abcabc",
        ].joined(separator: "\n")

        let tokens = processor.processTokens(for: input)
        let expected = [
            Token(.keyword, range: NSRange(0..<3)),
            Token(.keyword, range: NSRange(9..<12)),
            Token(.keyword, range: NSRange(14..<17)),
        ]

        XCTAssertEqual(tokens, expected)
    }

    /// A keyword on the line after a line comment is still recognized.
    func testEndOfLineFollowedByNewlineThenExact() throws {
        let language = Language(keywords: ["abc"], lineComment: "--")
        let processor = Processor(language: language)

        // The second line carries one leading whitespace character: the
        // expected range 12..<15 requires "abc" to start at offset 12, and
        // the line itself starts at offset 11.
        let input = [
            "-- comment",
            " abc def",
        ].joined(separator: "\n")

        let tokens = processor.processTokens(for: input)
        let expected = [
            Token(.comment, range: NSRange(0..<10)),
            Token(.keyword, range: NSRange(12..<15)),
        ]

        XCTAssertEqual(tokens, expected)
    }
}

0 comments on commit 1687136

Please sign in to comment.