-
-
Notifications
You must be signed in to change notification settings - Fork 13
[facebook 1] feat: import wall posts from Meta export #405
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 29 commits
8c38853
45a383f
fd500e0
35cd45c
8c02088
9c5ef80
b58504d
0a1969e
5f167ab
a5101e8
baf36c6
ddf84f9
bfa4452
33e08f3
b3cccec
508cd66
9c32bf4
c030bfd
75230d0
979b6bd
f6a1625
d9e2a29
1798996
5735f04
fcd4f18
a1ef2ea
4bf23ae
a4c437c
a543ccc
43c35d5
de64aa2
97c793a
b4e07d8
be9bb92
03daa4e
303b9d0
33d144c
73d8821
5ad60e4
26cc1fd
4b6a0fd
e2efbec
c35c739
21ef0e2
8244c09
3703766
80c1ea2
2a1fad7
66ae851
d2f23f6
0bf6090
49b66ce
dcb2078
a508edb
4211521
7bcf910
7160150
4121fb8
59f8ebe
5e91ecc
98ab0c6
f371cd3
9bdbd60
e0c1d34
ea6418d
e0c56ea
f7a5277
cba0d5e
0ef9809
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,9 +1,13 @@ | ||
| import path from 'path' | ||
| import fs from 'fs' | ||
| import os from 'os' | ||
|
|
||
| import fetch from 'node-fetch'; | ||
| import { session } from 'electron' | ||
| import log from 'electron-log/main'; | ||
| import Database from 'better-sqlite3' | ||
| import unzipper from 'unzipper'; | ||
| import { glob } from 'glob'; | ||
|
|
||
| import { | ||
| getAccountDataPath, | ||
|
|
@@ -13,6 +17,7 @@ import { | |
| FacebookJob, | ||
| FacebookProgress, | ||
| emptyFacebookProgress, | ||
| FacebookImportArchiveResponse, | ||
| } from '../shared_types' | ||
| import { | ||
| runMigrations, | ||
|
|
@@ -25,6 +30,8 @@ import { IMITMController } from '../mitm'; | |
| import { | ||
| FacebookJobRow, | ||
| convertFacebookJobRowToFacebookJob, | ||
| FacebookArchivePost, | ||
| FacebookPostRow | ||
| } from './types' | ||
|
|
||
| export class FacebookAccountController { | ||
|
|
@@ -113,7 +120,7 @@ export class FacebookAccountController { | |
| } | ||
|
|
||
| // Make sure the account data folder exists | ||
| this.accountDataPath = getAccountDataPath('X', this.account.name); | ||
| this.accountDataPath = getAccountDataPath("Facebook", `${this.account.accountID} ${this.account.name}`); | ||
| log.info(`FacebookAccountController.initDB: accountDataPath=${this.accountDataPath}`); | ||
|
|
||
| // Open the database | ||
|
|
@@ -138,7 +145,19 @@ export class FacebookAccountController { | |
| id INTEGER PRIMARY KEY AUTOINCREMENT, | ||
| key TEXT NOT NULL UNIQUE, | ||
| value TEXT NOT NULL | ||
| );` | ||
| );`] | ||
| }, | ||
| { | ||
| name: "20250220_add_post_table", | ||
| sql: [ | ||
| `CREATE TABLE post ( | ||
| id INTEGER PRIMARY KEY AUTOINCREMENT, | ||
| postID TEXT NOT NULL UNIQUE, | ||
| createdAt DATETIME NOT NULL, | ||
| title TEXT, | ||
| text TEXT, | ||
| addedToDatabaseAt DATETIME NOT NULL | ||
| );` | ||
| ] | ||
| }, | ||
| ]) | ||
|
|
@@ -220,7 +239,7 @@ export class FacebookAccountController { | |
| } | ||
| const buffer = await response.buffer(); | ||
| log.info("FacebookAccountController.getProfileImageDataURI: buffer", buffer); | ||
| return `data:${response.headers.get('content-type')};base64,${buffer.toString('base64')}`; | ||
| return `data: ${response.headers.get('content-type')}; base64, ${buffer.toString('base64')}`; | ||
| } catch (e) { | ||
| log.error("FacebookAccountController.getProfileImageDataURI: error", e); | ||
| return ""; | ||
|
|
@@ -234,4 +253,264 @@ export class FacebookAccountController { | |
| async setConfig(key: string, value: string) { | ||
| return setConfig(key, value, this.db); | ||
| } | ||
| } | ||
|
|
||
| // Unzip facebook archive to the account data folder using unzipper | ||
| // Return null if error, else return the unzipped path | ||
| async unzipFacebookArchive(archiveZipPath: string): Promise<string | null> { | ||
| if (!this.account) { | ||
| return null; | ||
| } | ||
| const unzippedPath = path.join(getAccountDataPath("Facebook", `${this.account.accountID} ${this.account.name}`), "tmp"); | ||
|
|
||
| const archiveZip = await unzipper.Open.file(archiveZipPath); | ||
| await archiveZip.extract({ path: unzippedPath }); | ||
|
|
||
| log.info(`FacebookAccountController.unzipFacebookArchive: unzipped to ${unzippedPath}`); | ||
|
|
||
| return unzippedPath; | ||
| } | ||
|
|
||
| // Delete the unzipped facebook archive once the build is completed | ||
| async deleteUnzippedFacebookArchive(archivePath: string): Promise<void> { | ||
| fs.rm(archivePath, { recursive: true, force: true }, err => { | ||
| if (err) { | ||
| log.error(`FacebookAccountController.deleteUnzippedFacebookArchive: Error occured while deleting unzipped folder: ${err} `); | ||
| } | ||
| }); | ||
| } | ||
|
|
||
| // Return null on success, and a string (error message) on error | ||
| async verifyFacebookArchive(archivePath: string): Promise<string | null> { | ||
| // If archivePath contains just one folder and no files, update archivePath to point to that inner folder | ||
| const archiveContents = fs.readdirSync(archivePath); | ||
| if (archiveContents.length === 1 && fs.lstatSync(path.join(archivePath, archiveContents[0])).isDirectory()) { | ||
| archivePath = path.join(archivePath, archiveContents[0]); | ||
| } | ||
|
|
||
| const foldersToCheck = [ | ||
| archivePath, | ||
| path.join(archivePath, "personal_information", "profile_information"), | ||
| ]; | ||
|
|
||
| // Make sure folders exist | ||
| for (let i = 0; i < foldersToCheck.length; i++) { | ||
| if (!fs.existsSync(foldersToCheck[i])) { | ||
| log.error(`XAccountController.verifyXArchive: folder does not exist: ${foldersToCheck[i]} `); | ||
| return `The folder ${foldersToCheck[i]} doesn't exist.`; | ||
| } | ||
| } | ||
|
|
||
| // Check if there's a profile_information.html file. This means the person downloaded the archive using HTML, not JSON. | ||
| const profileHtmlInformationPath = path.join(archivePath, "personal_information/profile_information/profile_information.html"); | ||
| if (fs.existsSync(profileHtmlInformationPath)) { | ||
| log.error(`FacebookAccountController.verifyFacebookArchive: file is in wrong format, expected JSON, not HTML: ${profileHtmlInformationPath}`); | ||
| return `The file ${profileHtmlInformationPath} file is in the wrong format. Request a JSON archive.`; | ||
| } | ||
|
|
||
| // Make sure profile_information.json exists and is readable | ||
| const profileInformationPath = path.join(archivePath, "personal_information/profile_information/profile_information.json"); | ||
| if (!fs.existsSync(profileInformationPath)) { | ||
| log.error(`FacebookAccountController.verifyFacebookArchive: file does not exist: ${profileInformationPath}`); | ||
| return `The file ${profileInformationPath} doesn't exist.`; | ||
| } | ||
| try { | ||
| fs.accessSync(profileInformationPath, fs.constants.R_OK); | ||
| } catch { | ||
| log.error(`FacebookAccountController.verifyFacebookArchive: file is not readable: ${profileInformationPath}`); | ||
| return `The file ${profileInformationPath} is not readable.`; | ||
| } | ||
|
|
||
| // Make sure the profile_information.json file belongs to the right account | ||
| try { | ||
| const profileData = JSON.parse(fs.readFileSync(profileInformationPath, 'utf-8')); | ||
|
|
||
| if (!profileData.profile_v2?.profile_uri) { | ||
| log.error("FacebookAccountController.verifyFacebookArchive: Could not find profile URI in archive"); | ||
| return "Could not find profile ID in archive"; | ||
| } | ||
|
|
||
| const profileUrl = profileData.profile_v2.profile_uri; | ||
| const profileId = profileUrl.split('id=')[1]; | ||
|
|
||
| if (!profileId) { | ||
| log.error("FacebookAccountController.verifyFacebookArchive: Could not extract profile ID from URL"); | ||
| return "Could not extract profile ID from URL"; | ||
| } | ||
|
|
||
| if (profileId !== this.account?.accountID) { | ||
| log.error(`FacebookAccountController.verifyFacebookArchive: profile_information.json does not belong to the right account`); | ||
| return `This archive is for @${profileId}, not @${this.account?.accountID}.`; | ||
| } | ||
| } catch { | ||
| return "Error parsing JSON in profile_information.json"; | ||
| } | ||
|
|
||
| return null; | ||
| } | ||
|
|
||
| // Return null on success, and a string (error message) on error | ||
| async importFacebookArchive(archivePath: string, dataType: string): Promise<FacebookImportArchiveResponse> { | ||
| if (!this.db) { | ||
| this.initDB(); | ||
| } | ||
|
|
||
| let importCount = 0; | ||
| const skipCount = 0; | ||
|
|
||
| // If archivePath contains just one folder and no files, update archivePath to point to that inner folder | ||
| const archiveContents = fs.readdirSync(archivePath); | ||
| if (archiveContents.length === 1 && fs.lstatSync(path.join(archivePath, archiveContents[0])).isDirectory()) { | ||
| archivePath = path.join(archivePath, archiveContents[0]); | ||
| } | ||
|
|
||
| // Load the username | ||
| let profileId: string; | ||
|
|
||
|
|
||
| try { | ||
| const profileInformationPath = path.join(archivePath, "personal_information/profile_information/profile_information.json"); | ||
| const profileData = JSON.parse(fs.readFileSync(profileInformationPath, 'utf-8')); | ||
|
|
||
| if (!profileData.profile_v2?.profile_uri) { | ||
| return { | ||
| status: "error", | ||
| errorMessage: "Could not find profile URI in archive", | ||
| importCount: importCount, | ||
| skipCount: skipCount, | ||
| }; | ||
| } | ||
|
|
||
| const profileUrl = profileData.profile_v2.profile_uri; | ||
| profileId = profileUrl.split('id=')[1] || ''; | ||
|
|
||
| if (!profileId) { | ||
| return { | ||
| status: "error", | ||
| errorMessage: "Could not extract profile ID from URL", | ||
| importCount: importCount, | ||
| skipCount: skipCount, | ||
| }; | ||
| } | ||
| } catch (e) { | ||
| return { | ||
| status: "error", | ||
| errorMessage: "Error parsing profile information JSON", | ||
| importCount: importCount, | ||
| skipCount: skipCount, | ||
| }; | ||
| } | ||
|
|
||
| // Import posts | ||
| if (dataType == "posts") { | ||
| const postsFilenames = await glob( | ||
| [ | ||
| // TODO: for really big Facebook archives, are there more files here? | ||
| path.join(archivePath, "your_facebook_activity", "posts", "your_posts__check_ins__photos_and_videos_1.json"), | ||
| ], | ||
| { | ||
| windowsPathsNoEscape: os.platform() == 'win32' | ||
| } | ||
| ); | ||
| if (postsFilenames.length === 0) { | ||
| return { | ||
| status: "error", | ||
| errorMessage: "No posts files found", | ||
| importCount: importCount, | ||
| skipCount: skipCount, | ||
| }; | ||
| } | ||
|
|
||
| // Go through each file and import the posts | ||
| for (let i = 0; i < postsFilenames.length; i++) { | ||
| const postsData: FacebookArchivePost[] = []; | ||
| try { | ||
| const postsFile = fs.readFileSync(postsFilenames[i], 'utf8'); | ||
| const posts = JSON.parse(postsFile); | ||
|
|
||
| for (const post of posts) { | ||
| // Skip if no post text | ||
| const postText = post.data?.find((d: { post?: string }) => 'post' in d && typeof d.post === 'string')?.post; | ||
| if (!postText) { | ||
| log.info("FacebookAccountController.importFacebookArchive: skipping post with no text"); | ||
| continue; | ||
| } | ||
|
|
||
| // Check if it's a shared post by looking for external_context in attachments | ||
| const isSharedPost = post.attachments?.[0]?.data?.[0]?.external_context !== undefined; | ||
|
||
|
|
||
| // Skip if it's a shared/repost, group post, shares a group, etc. We will extend the import logic | ||
| // to include other data types in the future. | ||
| if (isSharedPost) { | ||
| log.info("FacebookAccountController.importFacebookArchive: skipping shared post"); | ||
| continue; | ||
| } | ||
| else if (post.attachments) { | ||
| log.info("FacebookAccountController.importFacebookArchive: skipping unknown post type"); | ||
| continue; | ||
| } | ||
|
|
||
| postsData.push({ | ||
| id_str: post.timestamp.toString(), | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think using the timestamp as an id is reasonable since (hopefully) this will be unique. But the bigger concern for me is, since this actually isn't the real ID of the post, how will we use the info we're storing to delete this post in the future, if we choose to? For example, in my test account, if I click on a post it loads this URL:
The When I grep my FB archive for this string, there's nothing. So once we get to the point where we choose what posts to delete, how do we choose to delete this one if we can't build the URL to load it? I don't think this is something we need to solve in this PR, but I do think it makes sense to consider what we actually get from the archive. If we don't have real post IDs (or rather, story IDs, as it seems FB calls them) then maybe the deletion options just have to be like "delete all posts without media", "delete all videos", "delete all photos", etc., instead of basing it on reactions like we do with X.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't see how we can get the post ID from the archive, so we may need to do the latter. We could also provide options to delete different types of content in a specified time period, e.g. "Delete all posts more than 2 years old" |
||
| title: post.title || '', | ||
| full_text: postText, | ||
| created_at: new Date(post.timestamp * 1000).toISOString(), | ||
| }); | ||
| } | ||
| } catch (e) { | ||
| return { | ||
| status: "error", | ||
| errorMessage: "Error parsing JSON in exported posts", | ||
| importCount: importCount, | ||
| skipCount: skipCount, | ||
| }; | ||
| } | ||
|
|
||
| // Loop through the posts and add them to the database | ||
| try { | ||
| postsData.forEach((post) => { | ||
| // Is this post already there? | ||
| const existingPost = exec(this.db, 'SELECT * FROM post WHERE postID = ?', [post.id_str], "get") as FacebookPostRow; | ||
| if (existingPost) { | ||
| // Delete the existing post to re-import | ||
| exec(this.db, 'DELETE FROM post WHERE postID = ?', [post.id_str]); | ||
| } | ||
|
|
||
| // TODO: implement media import for facebook | ||
| // TODO: implement urls import for facebook | ||
|
|
||
| // Import it | ||
| exec(this.db, 'INSERT INTO post (postID, createdAt, title, text, addedToDatabaseAt) VALUES (?, ?, ?, ?, ?)', [ | ||
| post.id_str, | ||
| new Date(post.created_at), | ||
| post.title, | ||
| post.full_text, | ||
| new Date(), | ||
| ]); | ||
| importCount++; | ||
| }); | ||
| } catch (e) { | ||
| return { | ||
| status: "error", | ||
| errorMessage: "Error importing posts: " + e, | ||
| importCount: importCount, | ||
| skipCount: skipCount, | ||
| }; | ||
| } | ||
| } | ||
|
|
||
| return { | ||
| status: "success", | ||
| errorMessage: "", | ||
| importCount: importCount, | ||
| skipCount: skipCount, | ||
| }; | ||
| } | ||
|
|
||
| return { | ||
| status: "error", | ||
| errorMessage: "Invalid data type.", | ||
| importCount: importCount, | ||
| skipCount: skipCount, | ||
| }; | ||
| } | ||
| } | ||

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Probably. We're going to need people with big Facebook accounts to help us test this. I actually discovered the same problem with Twitter: it splits its JSON into multiple files for people with around 100k tweets. We should ask around.