Changes from 1 commit
Commits
25 commits
8d49e2f
Added initial implementation for writing lineage messages using HDFS …
rkrumins Oct 12, 2025
a35a99c
Added creation of the base customLineagePath directory ensuring linea…
rkrumins Oct 12, 2025
7a824bf
Updated documentation for Hdfs Dispatcher in spline.default.yaml file
rkrumins Oct 12, 2025
96373b4
Added initial integration tests for HDFS Lineage Dispatcher with cent…
rkrumins Oct 12, 2025
2b01b81
Aligning tests to be running same evaluation as when using default mode
rkrumins Oct 12, 2025
6f0ecdf
Minor cleanup in HDFSLineageDispatcher
rkrumins Oct 12, 2025
b064041
Fixed issue with unmatched types for resolveLineagePath in HDFSLineag…
rkrumins Oct 12, 2025
578c876
Fixing issues as per SonarQube for HDFSLineageDispatcher
rkrumins Oct 12, 2025
bebf4c9
Fix for the issue in fsScheme!
rkrumins Oct 12, 2025
04eebe3
Constant for file extension in HDFSLineageDispatcherSpec
rkrumins Oct 12, 2025
5f63f61
Removed outputSource filename when writing file to new location and u…
rkrumins Oct 12, 2025
8bb78a1
Fixing issues as per static code analysis
rkrumins Oct 13, 2025
7a93705
Fixing issues in logic for HDFSLineageDispatcher
rkrumins Oct 13, 2025
32db7ac
Fixing issue with exception handling for mkdirs in HDFSLineageDispatcher
rkrumins Oct 13, 2025
30f2f40
Updated integration test for custom lineage path in HDFSLineageDispat…
rkrumins Oct 13, 2025
87936b8
Ensuring the edgecase with Spark AppName containing non-standard char…
rkrumins Oct 13, 2025
4e699dd
HDFSLineageDispatcherSpec debug for failing integration test
rkrumins Oct 13, 2025
8a4b431
HDFSLineageDispatcherSpec debug for failing integration test
rkrumins Oct 13, 2025
6bc068d
Fixing issues in HDFSLineageDispatcherSpec
rkrumins Oct 13, 2025
d155a98
Fixing issues in HDFSLineageDispatcherSpec
rkrumins Oct 13, 2025
4c7d723
Fixed integration test and changed the filename to avoid clashed from…
rkrumins Oct 13, 2025
cd535c9
Added more robust check to ensure no _LINEAGE file is created
rkrumins Oct 13, 2025
0e5c69a
Added getOrElse when obtaining planId in HDFSLineageDispatcher
rkrumins Oct 13, 2025
bebc869
Added getOrElse when obtaining planId in HDFSLineageDispatcher
rkrumins Oct 13, 2025
7bd5b0a
Updated spline.default.yaml as per up-to-date details for HDFSLineage…
rkrumins Oct 13, 2025
Removed outputSource filename when writing file to new location and using internal SparkContext values
rkrumins committed Oct 12, 2025
commit 5f63f61e8032c408615819cb36e4657b83cb2aa6
15 changes: 8 additions & 7 deletions core/src/main/resources/spline.default.yaml
Original file line number Diff line number Diff line change
@@ -151,22 +151,23 @@ spline:
# If left empty, null, or not specified → DEFAULT MODE: lineage written alongside target data files
# If set to a path → CENTRALIZED MODE: all lineage written to this location with unique filenames
#
# CENTRALIZED MODE filename format: {timestamp}_{fileName}_{appId}
# CENTRALIZED MODE filename format: {timestamp}_{appName}_{appId}
# - timestamp: Human-readable UTC timestamp (yyyy-MM-dd_HH-mm-ss-SSS) for natural chronological sorting and easy filtering
# Example: 2025-10-12_14-30-45-123
# - fileName: The configured fileName value (e.g., "my_file.parq_LINEAGE")
# - appName: Spark application name for easy identification of which job generated the lineage
# Example: MySparkJob
# - appId: Spark application ID for traceability to specific runs
# Example: app-20251012143045-0001
#
# More examples:
# Examples (assuming app name is "MySparkJob"):
# - Local: customLineagePath: /my/centralized/lineage
# Output: /my/centralized/lineage/2025-10-12_14-30-45-123_my_file.parq_LINEAGE_app-20251012143045-0001
# Output: /my/centralized/lineage/2025-10-12_14-30-45-123_MySparkJob_app-20251012143045-0001
# - S3: customLineagePath: s3://my-bucket/lineage
# Output: s3://my-bucket/lineage/2025-10-12_14-30-45-123_my_file.parq_LINEAGE_app-20251012143045-0001
# Output: s3://my-bucket/lineage/2025-10-12_14-30-45-123_MySparkJob_app-20251012143045-0001
# - GCS: customLineagePath: gs://my-bucket/lineage
# Output: gs://my-bucket/lineage/2025-10-12_14-30-45-123_my_file.parq_LINEAGE_app-20251012143045-0001
# Output: gs://my-bucket/lineage/2025-10-12_14-30-45-123_MySparkJob_app-20251012143045-0001
# - HDFS: customLineagePath: hdfs://cluster/lineage
# Output: hdfs://cluster/lineage/2025-10-12_14-30-45-123_my_file.parq_LINEAGE_app-20251012143045-0001
# Output: hdfs://cluster/lineage/2025-10-12_14-30-45-123_MySparkJob_app-20251012143045-0001
# -------------------------------------------
# Open Lineage HTTP dispatcher
# -------------------------------------------
core/src/main/scala/za/co/absa/spline/harvester/dispatcher/HDFSLineageDispatcher.scala
@@ -51,9 +51,9 @@ import scala.concurrent.blocking
*
* 2. CENTRALIZED MODE (customLineagePath set to a valid path):
* All lineage files are written to a single centralized location with unique filenames.
* Filename format: {timestamp}_{fileName}_{appId}
* Filename format: {timestamp}_{appName}_{appId}
* - timestamp: Human-readable UTC timestamp (yyyy-MM-dd_HH-mm-ss-SSS) for chronological sorting and filtering
* - fileName: The configured fileName value (e.g., "my_file.parq_LINEAGE")
* - appName: Spark application name for easy identification
* - appId: Spark application ID for traceability
*
* The timestamp-first format ensures natural chronological sorting and easy date-based filtering.
@@ -87,7 +87,7 @@ class HDFSLineageDispatcher(filename: String, permission: FsPermission, bufferSi
throw new IllegalStateException("send(event) must be called strictly after send(plan) method with matching plan ID")

try {
val path = resolveLineagePath(event.planId.toString)
val path = resolveLineagePath()
val planWithEvent = Map(
"executionPlan" -> this._lastSeenPlan,
"executionEvent" -> event
@@ -108,6 +108,7 @@ class HDFSLineageDispatcher(filename: String, permission: FsPermission, bufferSi
* @return The full path where the lineage file should be written
*/
private def resolveLineagePath(): String = {
val outputSource = s"${this._lastSeenPlan.operations.write.outputSource}"
customLineagePath match {
case Some(customPath) =>
// Centralized mode: write to custom path with unique filename
@@ -116,28 +117,30 @@ class HDFSLineageDispatcher(filename: String, permission: FsPermission, bufferSi
s"$cleanCustomPath/$uniqueFilename"
case None =>
// Default mode: write alongside target data file
s"${this._lastSeenPlan.operations.write.outputSource.stripSuffix("/")}/$filename"
s"${outputSource.stripSuffix("/")}/$filename"
}
}

/**
* Generates a unique filename for centralized lineage storage.
*
* Format: {timestamp}_{fileName}_{appId}
* Example: 2025-10-12_14-30-45-123_lineage_app-20251012143045-0001
* Format: {timestamp}_{appName}_{appId}
* Example: 2025-10-12_14-30-45-123_MySparkJob_app-20251012143045-0001
*
* This format optimizes for operational debugging use cases:
* - Timestamp FIRST: Ensures natural chronological sorting (most recent files appear together)
* - Application Name: Easy identification of which job generated the lineage
* - Application ID: Full traceability to specific Spark application run
*
* @return A unique filename optimized for filtering and sorting
*/
private def generateUniqueFilename(): String = {
val sparkContext = SparkContext.getOrCreate()
val appName = sparkContext.appName
val appId = sparkContext.applicationId
val dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd_HH-mm-ss-SSS").withZone(ZoneId.of("UTC"))
val timestamp = dateFormatter.format(Instant.now())
s"${timestamp}_${filename}_${appId}"
s"${timestamp}_${appName}_${appId}"
}
Comment on lines +147 to +156
⚠️ Potential issue | 🟠 Major

Sanitize appName and appId to prevent filesystem path issues.

The Spark application name can contain spaces, slashes, or special characters that may cause filesystem path issues. While the filename format is well-designed for sorting, unsanitized names can break path construction.

Apply this diff to sanitize the application metadata:

   private def generateUniqueFilename(): String = {
     val sparkContext = SparkContext.getOrCreate()
-    val appName = sparkContext.appName
-    val appId = sparkContext.applicationId
+    val appName = sparkContext.appName.replaceAll("[^a-zA-Z0-9_-]", "_")
+    val appId = sparkContext.applicationId.replaceAll("[^a-zA-Z0-9_-]", "_")
     val dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd_HH-mm-ss-SSS").withZone(ZoneId.of("UTC"))
     val timestamp = dateFormatter.format(Instant.now())
     s"${timestamp}_${appName}_${appId}"
   }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
private def generateUniqueFilename(): String = {
val sparkContext = SparkContext.getOrCreate()
val appName = sparkContext.appName
val appId = sparkContext.applicationId
val dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd_HH-mm-ss-SSS").withZone(ZoneId.of("UTC"))
val timestamp = dateFormatter.format(Instant.now())
s"${timestamp}_${appName}_${appId}"
}
private def generateUniqueFilename(): String = {
val sparkContext = SparkContext.getOrCreate()
val appName = sparkContext.appName.replaceAll("[^a-zA-Z0-9_-]", "_")
val appId = sparkContext.applicationId.replaceAll("[^a-zA-Z0-9_-]", "_")
val dateFormatter = DateTimeFormatter
.ofPattern("yyyy-MM-dd_HH-mm-ss-SSS")
.withZone(ZoneId.of("UTC"))
val timestamp = dateFormatter.format(Instant.now())
s"${timestamp}_${appName}_${appId}"
}
🤖 Prompt for AI Agents
In
core/src/main/scala/za/co/absa/spline/harvester/dispatcher/HDFSLineageDispatcher.scala
around lines 137 to 144, the generated filename uses raw sparkContext.appName
and sparkContext.applicationId which may contain spaces, slashes or special
characters that break filesystem paths; sanitize both values before composing
the filename by replacing or removing unsafe characters (e.g., replace
non-alphanumeric, dash, underscore characters with underscore), trim length if
necessary, and then build the filename using the sanitized appName and appId so
the resulting string is safe for use in HDFS paths.
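For illustration, the regex suggested in the review behaves as follows on a couple of hypothetical app names (a standalone sketch, not part of the PR):

```scala
// Sketch of the suggested sanitization, applied to made-up inputs.
object SanitizeDemo extends App {
  def sanitize(s: String): String = s.replaceAll("[^a-zA-Z0-9_-]", "_")

  // Spaces, slashes and punctuation all collapse to underscores,
  // so the result is safe to use as a single HDFS path segment.
  println(sanitize("My Spark Job!")) // My_Spark_Job_
  println(sanitize("etl/daily run")) // etl_daily_run
}
```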


/**
@@ -43,7 +43,7 @@ class HDFSLineageDispatcherSpec
val lineageDispatcherConfigCustomLineagePathKeyName = s"$lineageDispatcherConfigKeyName.$lineageDispatcherConfigValueName.customLineagePath"
val destFilePathExtension = ".parquet"

it should "save lineage file to a filesystem in DEFAULT mode (no customLineagePath)" taggedAs ignoreIf(ver"$SPARK_VERSION" < ver"2.3") in {
it should "save lineage file to a filesystem in DEFAULT mode" taggedAs ignoreIf(ver"$SPARK_VERSION" < ver"2.3") in {
withIsolatedSparkSession(_
.config(lineageDispatcherConfigKeyName, lineageDispatcherConfigValueName)
.config(lineageDispatcherConfigClassNameKeyName, classOf[HDFSLineageDispatcher].getName)
@@ -98,10 +98,10 @@ class HDFSLineageDispatcherSpec
val lineageFile = lineageFiles(0)
lineageFile.length should be > 0L

// Verify filename format: {timestamp}_{fileName}_{appId}
// Verify filename format: {timestamp}_{appName}_{appId}
val filename = lineageFile.getName
// Should match pattern: yyyy-MM-dd_HH-mm-ss-SSS__LINEAGE_app-...
filename should include("_LINEAGE_")
// Should match pattern: yyyy-MM-dd_HH-mm-ss-SSS_{appName}_app-...
// AppName and AppId are part of the filename
filename should startWith regex """\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}-\d{3}"""

val lineageJson = readFileToString(lineageFile, "UTF-8").fromJson[Map[String, Map[String, _]]]
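Taken together, the two dispatcher modes changed in this PR resolve the lineage path roughly as sketched below. This is a self-contained approximation with explicit parameters; the real dispatcher reads `outputSource` from the last seen plan and the app metadata from the live `SparkContext`:

```scala
import java.time.{Instant, ZoneId}
import java.time.format.DateTimeFormatter

object LineagePathDemo extends App {
  // Standalone sketch of resolveLineagePath/generateUniqueFilename from the diff.
  def resolveLineagePath(customLineagePath: Option[String],
                         outputSource: String,
                         filename: String,
                         appName: String,
                         appId: String,
                         now: Instant): String = {
    val fmt = DateTimeFormatter
      .ofPattern("yyyy-MM-dd_HH-mm-ss-SSS")
      .withZone(ZoneId.of("UTC"))
    customLineagePath match {
      // Centralized mode: unique, chronologically sortable filename
      case Some(customPath) =>
        s"${customPath.stripSuffix("/")}/${fmt.format(now)}_${appName}_${appId}"
      // Default mode: lineage file written alongside the target data
      case None =>
        s"${outputSource.stripSuffix("/")}/$filename"
    }
  }

  val ts = Instant.parse("2025-10-12T14:30:45.123Z")
  println(resolveLineagePath(Some("s3://my-bucket/lineage"), "s3://data/out",
    "_LINEAGE", "MySparkJob", "app-20251012143045-0001", ts))
  // s3://my-bucket/lineage/2025-10-12_14-30-45-123_MySparkJob_app-20251012143045-0001
  println(resolveLineagePath(None, "hdfs://cluster/data/out/",
    "_LINEAGE", "MySparkJob", "app-20251012143045-0001", ts))
  // hdfs://cluster/data/out/_LINEAGE
}
```

The timestamp-first layout is what makes a plain directory listing come out in chronological order.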