Refactoring tweets scraping.

This commit is contained in:
Offray Vladimir Luna Cárdenas 2022-05-15 08:23:21 -05:00
parent 742bb97446
commit a61de2ecb7
7 changed files with 41 additions and 33 deletions

View File

@ -0,0 +1,33 @@
accessing
collectRawTweetsFrom: anUrl upToPage: anInteger
	"Fetch the page contents reachable from anUrl (up to anInteger, as interpreted by
	#getPagesContentsFrom:upTo:) and answer a TweetsCollection built from them.

	The answered collection carries a query dictionary with the keys of the fetched
	pages under 'parameters' and the current timestamp under 'date'. Every tweet gets
	the key of the page it was found on stored in its metadata, under the timestamp
	(as a string) of the moment it was processed.

	Finally, each message except the last one records, in its timelines and under my
	userName, the id of the message that follows it in the collection.
	NOTE(review): the original code called that following message 'previous', which
	suggests messages are ordered newest first -- confirm against the page contents."

	| pages collection |
	pages := self getPagesContentsFrom: anUrl upTo: anInteger.
	collection := TweetsCollection new.
	collection query: (Dictionary new
		at: 'parameters' put: pages keys;
		at: 'date' put: DateAndTime now;
		yourself).
	pages keysAndValuesDo: [ :pageKey :pageDocument |
		| items |
		"The class attribute is matched literally, trailing space included."
		items := (pageDocument xpath: '//div[@class="timeline-item "]') asOrderedCollection
			collect: [ :element | element postCopy ].
		items do: [ :item |
			| scraped |
			scraped := Tweet new fromNitterHtmlItem: item.
			scraped metadata at: DateAndTime now asString put: pageKey.
			collection add: scraped ] ].
	"Drop nil entries before linking neighbours."
	collection messages: (collection messages reject: [ :each | each isNil ]).
	collection messages doWithIndex: [ :each :index |
		| current following |
		current := collection messages at: index.
		index < collection lastIndex ifTrue: [
			following := collection messages at: index + 1.
			current timelines at: self userName put: following id ] ].
	^ collection

View File

@ -1,33 +1,5 @@
accessing accessing
collectRawTweetsUpToPage: anInteger collectRawTweetsUpToPage: anInteger
| pagesDict response customQuery | ^ self collectRawTweetsFrom: self userNameLinkWithReplies upToPage: anInteger
pagesDict := self getPagesContentsUpto: anInteger.
response := TweetsCollection new.
customQuery := Dictionary new
at: 'parameters' put: pagesDict keys;
at: 'date' put: DateAndTime now;
yourself.
response query: customQuery.
pagesDict keysAndValuesDo: [ :key :rawTweets | | temp |
temp := (rawTweets xpath: '//div[@class="timeline-item "]') asOrderedCollection
collect: [ :xmlElement | xmlElement postCopy ].
temp do: [ :tweet | | tempTweet |
tempTweet := Tweet new fromNitterHtmlItem: tweet.
tempTweet metadata
at: DateAndTime now asString put: key;
yourself.
response add: tempTweet.
]
].
response messages: (response messages select: [ :tweet | tweet isNotNil ]).
response messages doWithIndex: [ :tweet :i |
| current previous |
current := response messages at: i.
i < response lastIndex ifTrue: [
previous := response messages at: i + 1.
current timelines
at: self userName put: previous id;
yourself ]].
^ response.

View File

@ -1,3 +1,3 @@
operation operation
documentTree documentTree
^ self documentTreeFor: (self userNameLink, '/with_replies') ^ self documentTreeFor: self userNameLinkWithReplies

View File

@ -1,5 +1,5 @@
accessing accessing
getPagesContentsFrom: anURL Upto: anInteger getPagesContentsFrom: anURL upTo: anInteger
"I retroactively get all pages contents until a specified page number. "I retroactively get all pages contents until a specified page number.
TO DO: should this be splitted back to two methods, one getting the page urls and other its content? TO DO: should this be splitted back to two methods, one getting the page urls and other its content?

View File

@ -1,4 +1,4 @@
accessing accessing
getPagesContentsFromOldestUpto: anInteger getPagesContentsFromOldestUpto: anInteger
^ self getPagesContentsFrom: ((self oldestTweet metadata select: [ :item | item isString and: [ item beginsWith: 'https://' ]]) values first) Upto: anInteger ^ self getPagesContentsFrom: ((self oldestTweet metadata select: [ :item | item isString and: [ item beginsWith: 'https://' ]]) values first) upTo: anInteger

View File

@ -5,4 +5,4 @@ getPagesContentsUpto: anInteger
TO DO: should this be splitted back to two methods, one getting the page urls and other its content? TO DO: should this be splitted back to two methods, one getting the page urls and other its content?
or do we always be getting the cursor urls and its contents all the time. or do we always be getting the cursor urls and its contents all the time.
[ ] Benchmark alternative approaches." [ ] Benchmark alternative approaches."
^ self getPagesContentsFrom: (self userNameLink, '/with_replies') Upto: anInteger ^ self getPagesContentsFrom: self userNameLinkWithReplies upTo: anInteger

View File

@ -0,0 +1,3 @@
operation
userNameLinkWithReplies
	"Answer my userNameLink with '/with_replies' appended."

	| base |
	base := self userNameLink.
	^ base , '/with_replies'