Add LLM-based evaluation for WordPressIntelligence #25059

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft

kean wants to merge 6 commits into trunk from task/llm-as-judge

Modules/Package.swift

-Original file line number
+Diff line change
@@ Expand Up / @@ -20,6 +20,7 @@ let package = Package( @@
             .library(name: "WordPressFlux", targets: ["WordPressFlux"]),
             .library(name: "WordPressShared", targets: ["WordPressShared"]),
             .library(name: "WordPressUI", targets: ["WordPressUI"]),
+            .library(name: "WordPressIntelligence", targets: ["WordPressIntelligence"]),
             .library(name: "WordPressReader", targets: ["WordPressReader"]),
             .library(name: "WordPressCore", targets: ["WordPressCore"]),
             .library(name: "WordPressCoreProtocols", targets: ["WordPressCoreProtocols"]),
@@ Expand Down Expand Up / @@ -163,6 +164,10 @@ let package = Package( @@
                 // This package should never have dependencies – it exists to expose protocols implemented in WordPressCore
                 // to UI code, because `wordpress-rs` doesn't work nicely with previews.
             ]),
+            .target(name: "WordPressIntelligence", dependencies: [
+                "WordPressShared",
+                .product(name: "SwiftSoup", package: "SwiftSoup"),
+            ]),
             .target(name: "WordPressLegacy", dependencies: ["DesignSystem", "WordPressShared"]),
             .target(name: "WordPressSharedObjC", resources: [.process("Resources")], swiftSettings: [.swiftLanguageMode(.v5)]),
             .target(
@@ Expand Down Expand Up / @@ -251,6 +256,7 @@ let package = Package( @@
             .testTarget(name: "WordPressSharedObjCTests", dependencies: [.target(name: "WordPressShared"), .target(name: "WordPressTesting")], swiftSettings: [.swiftLanguageMode(.v5)]),
             .testTarget(name: "WordPressUIUnitTests", dependencies: [.target(name: "WordPressUI")], swiftSettings: [.swiftLanguageMode(.v5)]),
             .testTarget(name: "WordPressCoreTests", dependencies: [.target(name: "WordPressCore")]),
+            .testTarget(name: "WordPressIntelligenceTests", dependencies: [.target(name: "WordPressIntelligence")])
         ]
     )
@@ Expand Down Expand Up / @@ -348,6 +354,7 @@ enum XcodeSupport { @@
                 "ShareExtensionCore",
                 "Support",
                 "WordPressFlux",
+                "WordPressIntelligence",
                 "WordPressShared",
                 "WordPressLegacy",
                 "WordPressReader",
@@ Expand Down @@

Modules/Sources/WordPressIntelligence/IntelligenceService.swift

-Original file line number
+Diff line change
@@ -0,0 +1,52 @@
+    import Foundation
+    import FoundationModels
+    /// Namespace for helpers shared by the on-device language model features:
+    /// availability checks, input trimming, and locale instructions.
+    public enum IntelligenceService {
+        /// Maximum context size for language model sessions (in tokens).
+        ///
+        /// A single token corresponds to three or four characters in languages like
+        /// English, Spanish, or German, and one token per character in languages like
+        /// Japanese, Chinese, or Korean. In a single session, the sum of all tokens
+        /// in the instructions, all prompts, and all outputs count toward the context window size.
+        ///
+        /// https://developer.apple.com/documentation/foundationmodels/generating-content-and-performing-tasks-with-foundation-models#Consider-context-size-limits-per-session
+        public static let contextSizeLimit = 4096
+        /// Checks if intelligence features are supported on the current device.
+        ///
+        /// Returns `false` before iOS 26 and when the device is not eligible (or the
+        /// unavailability reason is unknown). Returns `true` when the default system
+        /// model is available, and also when it is unavailable only because Apple
+        /// Intelligence is not enabled or the model is not ready.
+        public nonisolated static var isSupported: Bool {
+            guard #available(iOS 26, *) else {
+                return false
+            }
+            switch SystemLanguageModel.default.availability {
+            case .available:
+                return true
+            case .unavailable(let reason):
+                switch reason {
+                case .appleIntelligenceNotEnabled, .modelNotReady:
+                    // The model cannot be used right now, but the feature still
+                    // counts as supported for these reasons.
+                    return true
+                case .deviceNotEligible:
+                    return false
+                @unknown default:
+                    return false
+                }
+            }
+        }
+        /// Extracts relevant text from post content, removing HTML and limiting size.
+        ///
+        /// The size cap is `contextSizeLimit * ratio`, rounded toward zero, and is
+        /// applied to *characters* of the extracted text (not tokens).
+        ///
+        /// - Parameters:
+        ///   - post: The post content to process.
+        ///   - ratio: Fraction of `contextSizeLimit` to use as the character cap.
+        /// - Returns: The extracted text truncated to the cap. If extraction throws,
+        ///   the unprocessed `post` is truncated instead. Returns an empty string when
+        ///   the cap is below one or is not a number.
+        public static func extractRelevantText(from post: String, ratio: CGFloat = 0.6) -> String {
+            // Best effort: fall back to the raw post when extraction fails.
+            let text = (try? ContentExtractor.extractRelevantText(from: post)) ?? post
+            let limit = Double(IntelligenceService.contextSizeLimit) * Double(ratio)
+            // `prefix(_:)` traps on a negative length and `Int(_:)` traps on NaN,
+            // infinity, and out-of-range values, so those limits are handled here
+            // instead of being converted. NaN fails this comparison as well.
+            guard limit >= 1 else {
+                return ""
+            }
+            // Nothing to cut (also covers infinite and very large limits).
+            guard limit < Double(text.count) else {
+                return text
+            }
+            return String(text.prefix(Int(limit)))
+        }
+        // As documented in https://developer.apple.com/documentation/foundationmodels/supporting-languages-and-locales-with-foundation-models?changes=_10_5#Use-Instructions-to-set-the-locale-and-language
+        /// Builds the instruction sentence that tells the model the person's locale.
+        ///
+        /// - Parameter locale: The locale to describe. Defaults to `Locale.current`.
+        /// - Returns: An empty string when the locale's language is equivalent to
+        ///   `en_US`; otherwise a sentence containing the locale identifier.
+        static func makeLocaleInstructions(for locale: Locale = Locale.current) -> String {
+            if Locale.Language(identifier: "en_US").isEquivalent(to: locale.language) {
+                // Skip the locale phrase for U.S. English.
+                return ""
+            } else {
+                // Specify the person's locale with the exact phrase format.
+                return "The person's locale is \(locale.identifier)."
+            }
+        }
+    }

Modules/Sources/WordPressIntelligence/Parameters/ContentLength.swift

-Original file line number
+Diff line change
@@ -0,0 +1,58 @@
+    import Foundation
+    import WordPressShared
+    /// Target length for generated text.
+    ///
+    /// Ranges are calibrated for English and account for cross-language variance.
+    /// Sentences are the primary indicator; word counts accommodate language differences.
+    ///
+    /// - **Short**: 1-2 sentences (15-35 words) - Social media, search snippets
+    /// - **Medium**: 2-4 sentences (40-80 words) - RSS feeds, blog listings
+    /// - **Long**: 5-7 sentences (90-130 words) - Detailed previews, newsletters
+    ///
+    /// Word ranges are intentionally wide (the upper bound is up to ~2.3x the lower
+    /// bound) to handle differences in language structure (German compounds,
+    /// Romance wordiness, CJK tokenization).
+    public enum ContentLength: Int, CaseIterable, Sendable {
+        case short
+        case medium
+        case long
+        /// Localized, user-facing name of the length option.
+        public var displayName: String {
+            switch self {
+            case .short:
+                AppLocalizedString("generation.length.short", value: "Short", comment: "Generated content length (needs to be short)")
+            case .medium:
+                AppLocalizedString("generation.length.medium", value: "Medium", comment: "Generated content length (needs to be short)")
+            case .long:
+                AppLocalizedString("generation.length.long", value: "Long", comment: "Generated content length (needs to be short)")
+            }
+        }
+        /// Fixed, non-localized identifier for the length option.
+        public var trackingName: String {
+            switch self {
+            case .short: "short"
+            case .medium: "medium"
+            case .long: "long"
+            }
+        }
+        /// Text describing the target length, built from `sentenceRange` and
+        /// `wordRange`, e.g. "1-2 sentences (15-35 words)" for `.short`.
+        public var promptModifier: String {
+            "\(sentenceRange.lowerBound)-\(sentenceRange.upperBound) sentences (\(wordRange.lowerBound)-\(wordRange.upperBound) words)"
+        }
+        /// Target number of sentences (inclusive bounds).
+        public var sentenceRange: ClosedRange<Int> {
+            switch self {
+            case .short: 1...2
+            case .medium: 2...4
+            case .long: 5...7
+            }
+        }
+        /// Target number of words (inclusive bounds).
+        public var wordRange: ClosedRange<Int> {
+            switch self {
+            case .short: 15...35
+            case .medium: 40...80
+            case .long: 90...130
+            }
+        }
+    }

Modules/Sources/WordPressIntelligence/Parameters/WritingStyle.swift

-Original file line number
+Diff line change
@@ -0,0 +1,40 @@
+    import Foundation
+    import WordPressShared
+    /// Tone options that can be requested for generated text.
+    public enum WritingStyle: String, CaseIterable, Sendable {
+        case engaging, conversational, witty, formal, professional
+        /// Localized, user-facing name of the style.
+        public var displayName: String {
+            switch self {
+            case .engaging:
+                return AppLocalizedString("generation.style.engaging", value: "Engaging", comment: "AI generation style")
+            case .conversational:
+                return AppLocalizedString("generation.style.conversational", value: "Conversational", comment: "AI generation style")
+            case .witty:
+                return AppLocalizedString("generation.style.witty", value: "Witty", comment: "AI generation style")
+            case .formal:
+                return AppLocalizedString("generation.style.formal", value: "Formal", comment: "AI generation style")
+            case .professional:
+                return AppLocalizedString("generation.style.professional", value: "Professional", comment: "AI generation style")
+            }
+        }
+        /// Style description for prompts: the raw value followed by
+        /// `promptModifierDetails` in parentheses.
+        var promptModifier: String {
+            let details = promptModifierDetails
+            return rawValue + " (" + details + ")"
+        }
+        /// Short English description of the tone for each style.
+        var promptModifierDetails: String {
+            switch self {
+            case .engaging:
+                return "engaging and compelling tone"
+            case .conversational:
+                return "friendly and conversational tone"
+            case .witty:
+                return "witty, creative, entertaining"
+            case .formal:
+                return "formal and academic tone"
+            case .professional:
+                return "professional and polished tone"
+            }
+        }
+    }

Modules/Sources/WordPressIntelligence/README.md

-Original file line number
+Diff line change
@@ -0,0 +1,143 @@
+    # WordPressIntelligence
+    AI-powered content intelligence for WordPress using Apple Foundation Models.
+    ## Features
+    - **Excerpt Generation** - Generate 3 excerpt variations in 8 languages with configurable length/style
+    - **Tag Suggestions** - AI-powered tag recommendations
+    - **Post Summaries** - Automatic content summarization
+    ## Requirements
+    - iOS 26.0+
+    - Device with Apple Intelligence support
+    ## Usage
+    ```swift
+    let generator = ExcerptGeneration(length: .medium, style: .engaging)
+    let excerpts = try await generator.generate(for: postContent)
+    ```
+    **Languages**: English, Spanish, French, German, Italian, Portuguese, Japanese, Chinese
+    **Lengths**: Short (15-35 words), Medium (40-80 words), Long (90-130 words)
+    **Styles**: Engaging, Professional, Conversational, Formal, Witty
+    ## Testing
+    ### Standard XCTest
+    Run standard tests that verify language, length, and diversity:
+    ```bash
+    cd Modules
+    xcodebuild test \
+      -scheme Modules-Package \
+      -destination 'platform=iOS Simulator,name=iPhone 16 Pro,OS=26.0' \
+      -only-testing:WordPressIntelligenceTests
+    ```
+    ### Quality Evaluation
+    Evaluate AI-generated content quality using Claude scoring. Requires [Claude CLI](https://github.com/anthropics/claude-cli).
+    **Location**: `Modules/Tests/WordPressIntelligenceTests/`
+    ```bash
+    # Quick start
+    cd Modules/Tests/WordPressIntelligenceTests
+    make                    # Show all available commands
+    make eval               # Run full evaluation (all test types)
+    make eval-quick         # Run English excerpt evaluation
+    make eval TESTS="excerpts"       # Run only excerpt tests
+    make eval TESTS="excerpts tags"  # Run excerpt and tag tests
+    make eval-tags          # Evaluate tag suggestions
+    make eval-summary       # Evaluate post summaries
+    make open               # Open latest HTML report
+    ```
+    **Common targets**:
+    - `make eval` - Run full evaluation for all test types (excerpts, tags, summary)
+    - `make eval TESTS="excerpts"` - Run only specific test types
+    - `make eval-quick` - Fast evaluation (English excerpts only)
+    - `make rebuild-improve` - Regenerate HTML with mock improvements (for UI development)
+    - `make open` - Open latest evaluation report
+    - `make help` - Show all available commands
+    For advanced options and HTML report development, see:
+    - `Modules/Tests/WordPressIntelligenceTests/Makefile`
+    - `Modules/Tests/WordPressIntelligenceTests/lib/DEVELOPMENT.md`
+    ### Evaluation Output
+    Results are saved to `/tmp/WordPressIntelligence-Tests/evaluation-<timestamp>/`:
+    - **`evaluation-report.html`** - Interactive report with filtering, sorting, baseline comparison
+    - **`evaluation-results.json`** - Machine-readable data for CI/CD
+    - Console output with quick summary
+    **HTML Report Features**:
+    - Sortable columns (test name, status, score, duration)
+    - Filter by language, status, or comparison results
+    - Baseline comparison with delta indicators (↑ improved, ↓ regressed, = unchanged)
+    - Click any test to see detailed scores, generated content, and Claude feedback
+    - Score distribution dots (●●●) show pass/warn/fail for each excerpt
+    ### Scoring
+    Quality scores use weighted criteria (1-10 scale):
+    **Excerpt Generation**:
+    - Language Match (3.0×), Grammar (2.0×), Relevance (2.0×) - critical factors
+    - Hook Quality (1.5×), Key Info (1.5×), Length, Style, Standalone, Engagement (1.0× each)
+    - Diversity: structural, angle, length, lexical variation
+    **Pass criteria**: Overall ≥ 7.0 AND no critical failures
+    **Needs Improvement**: 6.0-6.9 OR any score < 4.0
+    **Failed**: Language < 8.0 OR Grammar < 6.0 OR Overall < 6.0
+    *Note: Tag and summary evaluations use different criteria optimized for their use cases.*
+    ## Extending Tests
+    ### Adding Test Cases
+    1. Add test data to `lib/config.py`:
+    ```python
+    "new_test_case": TestConfig(
+        original_content="...",
+        language="english",
+        # ... other parameters
+    )
+    ```
+    2. Update `Makefile` if adding new test type:
+    ```makefile
+    eval-newtype:
+        @./lib/evaluate-with-claude.sh --test-type newtype
+    ```
+    ### Customizing Evaluation Criteria
+    Edit scoring logic in `lib/evaluators.py`. Each test type has its own evaluator class with weighted criteria and thresholds.
+    ### Developing HTML Report
+    For fast iteration on HTML report UI without re-running tests:
+    ```bash
+    make rebuild-improve    # Regenerate with mock improvements
+    # Edit lib/evaluation-viewer.html
+    make rebuild-improve    # Instant preview
+    ```
+    See `lib/DEVELOPMENT.md` for complete HTML development workflow.
+    ## Troubleshooting
+    **Tests skipped**: Missing iOS 26 or Apple Intelligence support
+    **Language issues**: Check prompt in `Sources/WordPressIntelligence/ExcerptGeneration.swift`
+    **Evaluation fails**: Install/configure Claude CLI: `pip install claude-cli && claude configure`
+    See `CLAUDE.md` for project development guidelines.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add LLM-based evaluation for WordPressIntelligence #25059

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Add LLM-based evaluation for WordPressIntelligence #25059

Are you sure you want to change the base?

Uh oh!

Add LLM-based evaluation for WordPressIntelligence #25059

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!