Ex. (49) - Optimized Malware Scanner with extension filters and specific content search is presented here in Python. This version of the hex-signature scanner integrates the Aho-Corasick multi-pattern matcher with an interactive graphical interface, extension-based filtering, and progress reporting, making the scan process both efficient and user-friendly. This code sample is one entry in a suite of 127 algorithms set out in Antivirus Engines: From Methods to Innovations, Design, and Applications (Elsevier Syngress, 2024).
The main.db signature file can be extended with new entries extracted manually using the Hexadecimal Signature Extractor. This utility quickly retrieves the first bytes of files and converts them into aligned hexadecimal format, ready to be added directly into the signature database used by the Aho-Corasick algorithm.
Note: This project is closely related to other applications in the same suite.
You may also want to check out:
In this version of the hex-signature scanner, the implementation uses the Aho-Corasick automaton (ahocorasick) to load and match malware signatures efficiently. The l function builds the automaton from a signature file and returns both the matcher and the total count of loaded signatures, which is shown immediately in the GUI (“Total Signatures Loaded: c”). Scanning has been refined to examine only the first and last 1024 bytes of each file (header and footer), where malicious code is often embedded; the q function reports a match as soon as a signature is detected. The graphical interface (Tkinter) now includes a progress bar, a live results pane, a summary label, and an editable entry where users specify the file extensions to include (e.g., “.exe,.dll,.docx”). The “Browse Folder and Scan” action remains disabled until a signature set is loaded (i.e., main.db, with ~30k malware signatures), and message boxes guide the workflow (e.g., prompting to load signatures or enter extensions). After scanning, a concise summary reports the number of infected and clean files, providing immediate, user-friendly feedback while improving overall scanning performance.
import os
import ahocorasick
import tkinter as t
from tkinter import filedialog as f, scrolledtext as s, ttk as k, messagebox as m
def l(sf):
    """Build an Aho-Corasick automaton from a ``name=hex`` signature file.

    Each usable line has the form ``<name>=<hex signature>``. Lines that do
    not split into exactly two parts on ``=`` are ignored, as are lines whose
    signature part is empty.

    Args:
        sf: Path of the signature file to read.

    Returns:
        A tuple ``(automaton, count)`` where ``automaton`` maps each hex
        signature to the one-element tuple ``(name,)`` and ``count`` is the
        number of signature lines that were added.

    Raises:
        OSError: If the signature file cannot be opened or read.
    """
    a = ahocorasick.Automaton()
    c = 0
    # "utf-8-sig" also accepts plain UTF-8 and drops a leading BOM, so the
    # first signature name is not polluted; undecodable bytes are replaced
    # instead of aborting the whole load.
    with open(sf, 'r', encoding='utf-8-sig', errors='replace') as x:
        for y in x:
            p = y.strip().split('=')
            if len(p) != 2:
                continue
            n = p[0].strip()
            # The scanner matches against bytes.hex() output, which is always
            # lowercase, so an upper-case signature would never be found.
            h = p[1].strip().lower()
            if not h:
                continue
            a.add_word(h, (n,))
            c += 1
    a.make_automaton()
    return a, c
def q(fp, a):
    """Scan the header and footer of one file for known hex signatures.

    Only the first 1024 bytes and the last 1024 bytes of the file are read
    and examined. Each region is converted to lowercase hex text and fed to
    the automaton; the first hit ends the scan.

    Args:
        fp: Path of the file to scan.
        a: Matcher exposing ``iter(text)`` that yields ``(end_index, value)``
            pairs, as returned by ``l``.

    Returns:
        A tuple ``(infected, report)``. ``infected`` is True only when a
        signature was found; it is False both for clean files and for files
        that could not be processed (the report text tells them apart).
    """
    try:
        with open(fp, 'rb') as x:
            # Read just the two regions of interest instead of loading the
            # whole file into memory.
            head = x.read(1024)
            size = x.seek(0, os.SEEK_END)
            if size > 1024:
                x.seek(size - 1024)
                tail = x.read(1024)
            else:
                # The file fits in the header window, so the footer would be
                # the very same bytes; scanning it again cannot add a hit.
                tail = None

        regions = [head.hex()]
        if tail is not None:
            regions.append(tail.hex())

        # NOTE(review): matching runs on hex text, so a hit may start on an
        # odd nibble rather than on a byte boundary -- confirm this is
        # acceptable for the signature set in use.
        for z in regions:
            for _end, sgn in a.iter(z):
                return True, f"File: {fp}\nMalicious! ^\nSignature found: {sgn}\n"
        return False, f"File: {fp}\nAppears to be clean!\n"
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort a folder scan.
        return False, f"File: {fp}\nStatus: Error processing file - {e}\n"
def r():
    """Build the Tkinter window of the hex-signature scanner and run it.

    The window offers a "Load Signature" button, an entry holding a
    comma-separated list of file extensions, a "Browse Folder and Scan"
    button (disabled until signatures are loaded), a progress bar, a summary
    label and a scrolling results pane. The call blocks in the Tk main loop
    until the window is closed.
    """
    a = None  # signature automaton from l(); None until a file is loaded
    c = 0     # number of signatures reported by l()
    R = t.Tk()
    R.title('A HEX Signature Scanner (Gagniuc - Scut AV)')

    def L():
        """Ask for a signature file, load it and enable the scan button."""
        nonlocal a, c
        sf = f.askopenfilename(
            title="Select Signature File",
            filetypes=(("Text files", "*.db"), ("All files", "*.*")),
        )
        if not sf:
            return
        try:
            loaded, count = l(sf)
        except (OSError, ValueError) as err:
            # Tell the user instead of letting the Tk callback fail silently;
            # any previously loaded signature set stays in effect.
            m.showerror("Error", f"Could not load signature file:\n{err}")
            return
        a, c = loaded, count
        B2.config(state=t.NORMAL)  # Changed here: scanning is now allowed.
        D.config(text=f"Total Signatures Loaded: {c}")

    def Q():
        """Ask for a folder and scan every file with a selected extension."""
        if not a:
            m.showinfo("Information", "Load a signature file before scanning.")
            return

        # Drop blank entries: "".split(',') yields [''], and endswith('') is
        # true for every name, which would silently scan all files.
        wanted = {z.strip().lower() for z in G.get().split(',')}
        wanted.discard('')
        if not wanted:
            m.showinfo("Information", "Enter file extensions to scan.")
            return
        x = tuple(wanted)  # str.endswith accepts a tuple of suffixes

        y = f.askdirectory()
        if not y:
            return

        H.delete("1.0", t.END)
        F = []
        for X, _, Y in os.walk(y):
            # Compare in lower case so "SETUP.EXE" is matched by ".exe".
            F.extend(os.path.join(X, z) for z in Y if z.lower().endswith(x))

        # Restart the bar for every scan; otherwise it keeps the previous
        # run's final value.
        I['value'] = 0
        I['maximum'] = len(F)
        ic = 0  # files with a signature hit
        cc = 0  # everything else; q() also returns False for unreadable files
        for done, fp in enumerate(F, start=1):
            inf, res = q(fp, a)
            H.insert(t.END, f"{res}\n")
            H.update_idletasks()
            I['value'] = done
            I.update_idletasks()
            if inf:
                ic += 1
            else:
                cc += 1
        J.config(text=f"Scan Summary: {ic} Infected, {cc} Clean")

    B = t.Button(R, text="Load Signature", command=L)
    B.pack(pady=10)

    D = t.Label(R, text="Total Signatures Loaded: 0")
    D.pack(pady=10)

    G = t.Entry(R)
    G.pack(pady=10)
    G.insert(0, ".exe,.dll,.docx")

    B2 = t.Button(
        R,
        text="Browse Folder and Scan",
        command=Q,
        state=t.DISABLED,
    )
    B2.pack(pady=10)

    I = k.Progressbar(
        R,
        orient="horizontal",
        length=300,
        mode="determinate",
    )
    I.pack(pady=10)

    J = t.Label(R, text="Scan Summary: Not started")
    J.pack(pady=10)

    H = s.ScrolledText(
        R,
        wrap=t.WORD,
        width=80,
        height=20,
    )
    H.pack(pady=10, padx=10)

    R.mainloop()
if __name__ == "__main__":
    r()

Output:
File: C:/Users/Paul/Desktop/C\BRAVOS.exe
Appears to be clean!
File: C:/Users/Paul/Desktop/C\de_ce_nu.dll
Appears to be clean!
File: C:/Users/Paul/Desktop/C\LaboratorATMClar.exe
Appears to be clean!
File: C:/Users/Paul/Desktop/C\LaboratorATMCript.exe
Appears to be clean!
File: C:/Users/Paul/Desktop/C\test 1.exe
Appears to be clean!
File: C:/Users/Paul/Desktop/C\test2.exe
Appears to be clean!
File: C:/Users/Paul/Desktop/C\test3.exe
Appears to be clean!
- Paul A. Gagniuc. Antivirus Engines: From Methods to Innovations, Design, and Applications. Cambridge, MA: Elsevier Syngress, 2024. pp. 1-656.
