-
Notifications
You must be signed in to change notification settings - Fork 68
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
240 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
const { | ||
customConsoleLog, | ||
wait, | ||
waitForElement, | ||
bigStepper, | ||
} = require('../../preloadFunctions'); | ||
const { ipcRenderer } = require('electron'); | ||
const fs = require('fs'); | ||
const path = require('path'); | ||
|
||
/**
 * Checks whether a post has already been saved in this platform's export file.
 *
 * The file is looked up at
 * `<userData>/surfer_data/<company>/<name>/<platformId>/<platformId>.json`,
 * where `<userData>` is the value returned by the `get-user-data-path` IPC
 * call. A post counts as saved when an entry of the file's `content` array has
 * the same `timestamp` and `text` as `currentPost`.
 *
 * @param {*} id - Run identifier; only used as a prefix for log output.
 * @param {string} platformId - Names both the export directory and the JSON file.
 * @param {string} company - Path segment under `surfer_data`.
 * @param {string} name - Path segment under `company`.
 * @param {{timestamp: string, text: string}} currentPost - Post to look for.
 * @returns {Promise<boolean>} `true` only when a matching entry is found;
 *   `false` when the file is missing, empty, unreadable, malformed, or has no
 *   matching entry.
 */
async function checkIfPostExists(id, platformId, company, name, currentPost) {
  const userData = await ipcRenderer.invoke('get-user-data-path');
  const filePath = path.join(
    userData,
    'surfer_data',
    company,
    name,
    platformId,
    `${platformId}.json`,
  );
  console.log(id, `Checking if file exists at ${filePath}`);

  // existsSync is synchronous, so it is called without `await`.
  if (!fs.existsSync(filePath)) {
    return false;
  }
  console.log(id, `File exists, reading file`);

  try {
    // Asynchronous read: this runs once per candidate post, so a blocking
    // read would stall the renderer on every check.
    const fileContent = await fs.promises.readFile(filePath, 'utf-8');
    if (fileContent.trim() === '') {
      console.log(id, 'File is empty');
      return false;
    }

    const posts = JSON.parse(fileContent);
    console.log(id, 'Posts: ', posts);
    if (!Array.isArray(posts?.content)) {
      console.log(id, 'Invalid or empty posts structure');
      return false;
    }

    const alreadySaved = posts.content.some(
      (post) =>
        post.timestamp === currentPost.timestamp &&
        post.text === currentPost.text,
    );
    if (alreadySaved) {
      console.log(id, 'Post already exists, skipping');
    }
    return alreadySaved;
  } catch (error) {
    // Best effort: an unreadable or malformed file (or one removed between the
    // existence check and the read) is treated as "post not saved yet".
    console.error(id, `Error reading or parsing file: ${error.message}`);
    return false;
  }
}
|
||
/**
 * Collects up to 100 posts from the X (Twitter) feed shown in the current
 * window and forwards each new one to the main process.
 *
 * Flow:
 *  1. Navigate to https://x.com/ when the window is on another host.
 *  2. Stop with `'CONNECT_WEBSITE'` when the page shows the sign-in prompt.
 *  3. Repeatedly read the rendered feed cells, scrolling each into view, and
 *     send every post that is neither collected in this run nor present in the
 *     export file through the `handle-update` IPC channel.
 *  4. Finish when 100 posts were collected or three consecutive rounds
 *     produced nothing new, then send `handle-update-complete`.
 *
 * @param {*} id - Run identifier passed through to logging and IPC messages.
 * @param {string} platformId - Platform identifier used for the export file and IPC.
 * @param {string} filename - Not used by this function; kept for the existing call signature.
 * @param {string} company - Company segment of the export path.
 * @param {string} name - Name segment of the export path.
 * @returns {Promise<'CONNECT_WEBSITE'|'HANDLE_UPDATE_COMPLETE'>}
 */
async function exportFeed(id, platformId, filename, company, name) {
  const MAX_POSTS = 100;
  const MAX_IDLE_ROUNDS = 3;
  // Passed to wait(); the log messages below describe it as seconds.
  const RETRY_DELAY = 2;

  // Compare the hostname rather than substring-matching the whole URL, which
  // would also accept unrelated addresses such as "dropbox.com".
  const { hostname } = window.location;
  if (hostname !== 'x.com' && !hostname.endsWith('.x.com')) {
    bigStepper(id, 'Navigating to Twitter');
    customConsoleLog(id, 'Navigating to Twitter');
    window.location.assign('https://x.com/');
  }
  await wait(5);

  if (document.body.innerText.toLowerCase().includes('sign in to x')) {
    bigStepper(id, 'Export stopped, waiting for sign in');
    customConsoleLog(
      id,
      'YOU NEED TO SIGN IN (click the eye in the top right)!',
    );
    ipcRenderer.send('connect-website', id);
    return 'CONNECT_WEBSITE';
  }

  bigStepper(id, 'Getting feed posts...');
  customConsoleLog(id, 'Starting feed collection');

  const feedArray = [];
  // Keys of posts already handled in this run: either collected, or found in
  // the export file. This avoids rescanning feedArray and re-reading the file
  // for cells that stay rendered across rounds.
  const handledKeys = new Set();
  let noNewPostsCount = 0;

  while (feedArray.length < MAX_POSTS && noNewPostsCount < MAX_IDLE_ROUNDS) {
    const posts = await waitForElement(
      id,
      'div[data-testid="cellInnerDiv"]',
      'Feed posts',
      true,
    );
    // NOTE(review): assumes waitForElement may yield null/undefined when
    // nothing matches — confirm against its implementation. Guarding here
    // turns that case into a retry instead of a TypeError.
    const postCount = posts?.length ?? 0;
    customConsoleLog(id, `Found ${postCount} posts on the page`);

    if (postCount === 0) {
      customConsoleLog(id, 'No posts found, waiting 2 seconds before retry');
      await wait(RETRY_DELAY);
      noNewPostsCount++;
      continue;
    }

    customConsoleLog(id, 'Processing new posts');
    const initialSize = feedArray.length;

    for (const post of posts) {
      if (feedArray.length >= MAX_POSTS) break;

      // Scrolling to each cell moves the feed along as cells are processed.
      post.scrollIntoView({
        behavior: 'instant',
        block: 'end',
      });

      // Cells without a <time> element are skipped.
      const timeElement = post.querySelector('time');
      if (!timeElement) continue;

      const jsonPost = {
        text: post.innerText.replace(/\n/g, ' '),
        timestamp: timeElement.getAttribute('datetime'),
        author:
          post.querySelector('div[data-testid="User-Name"]')?.innerText ||
          'Unknown',
      };

      // JSON-encode the pair so the key is unambiguous for any text content.
      const postKey = JSON.stringify([jsonPost.timestamp, jsonPost.text]);
      if (handledKeys.has(postKey)) continue;
      handledKeys.add(postKey);

      const postExists = await checkIfPostExists(
        id,
        platformId,
        company,
        name,
        jsonPost,
      );
      if (postExists) {
        customConsoleLog(id, 'Post already exists, skipping');
        continue;
      }

      ipcRenderer.send(
        'handle-update',
        company,
        name,
        platformId,
        JSON.stringify(jsonPost),
        id,
      );
      feedArray.push(jsonPost);
    }

    const newPostsAdded = feedArray.length - initialSize;
    customConsoleLog(
      id,
      `Added ${newPostsAdded} new unique posts. Total: ${feedArray.length}`,
    );

    if (newPostsAdded === 0) {
      customConsoleLog(id, 'NO NEW POSTS ADDED, TRYING AGAIN!');
      noNewPostsCount++;
    } else {
      noNewPostsCount = 0;
    }

    customConsoleLog(id, 'Waiting 2 seconds before getting more posts');
    await wait(RETRY_DELAY);
  }

  customConsoleLog(id, `Exporting ${feedArray.length} feed posts`);
  bigStepper(id, 'Exporting data');
  ipcRenderer.send('handle-update-complete', id, platformId, company, name);
  return 'HANDLE_UPDATE_COMPLETE';
}
|
||
module.exports = exportFeed; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
{ | ||
"name": "Twitter Feed", | ||
"description": "Exports 100 posts in your feed.", | ||
"isUpdated": true, | ||
"logoURL": "https://logo.clearbit.com/twitter.com", | ||
"connectURL": "https://twitter.com", | ||
"connectSelector": "img.css-9pa8cd" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Twitter Feed Scraper | ||
|
||
This scraper extracts the latest 100 posts from your Twitter feed. | ||
|
||
## Features | ||
|
||
- Automatically navigates to Twitter | ||
- Checks for user authentication | ||
- Scrolls through the feed to collect posts | ||
- Extracts post text, timestamp, and author | ||
- Saves data in JSON format | ||
|
||
## Usage | ||
|
||
1. Ensure you're logged into Twitter in the Electron browser | ||
2. Run the scraper | ||
3. Wait for the scraper to collect 100 posts or reach the end of available new posts | ||
4. The collected data will be saved in the specified JSON file | ||
|
||
## Output Format | ||
|
||
The scraper saves the data in the following format: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters