Refactoring tweets scraping.
This commit is contained in:
parent
742bb97446
commit
a61de2ecb7
@ -0,0 +1,33 @@
|
|||||||
|
accessing
|
||||||
|
collectRawTweetsFrom: anUrl upToPage: anInteger
	"Answer a TweetsCollection built from the pages obtained by sending
	#getPagesContentsFrom:upTo: with anUrl and anInteger.

	The collection's query is a Dictionary holding the keys of the fetched pages
	under 'parameters' and the current DateAndTime under 'date'.
	Every element matched by the timeline-item XPath on each page is turned into a
	Tweet via #fromNitterHtmlItem:, and the key of the page it came from is stored in
	the tweet's metadata under the current timestamp string.
	Finally nil messages are dropped and each message whose index is below
	the collection's #lastIndex records, in its timelines under the receiver's userName,
	the id of the message that follows it."

	| pages collection queryInfo |
	pages := self getPagesContentsFrom: anUrl upTo: anInteger.
	collection := TweetsCollection new.
	queryInfo := Dictionary new.
	queryInfo at: 'parameters' put: pages keys.
	queryInfo at: 'date' put: DateAndTime now.
	collection query: queryInfo.
	pages keysAndValuesDo: [ :pageKey :pageDocument |
		| items |
		"The trailing space inside the class value is part of the literal being matched;
		presumably it mirrors the served markup -- confirm before normalizing it."
		items := (pageDocument xpath: '//div[@class="timeline-item "]') asOrderedCollection.
		(items collect: [ :each | each postCopy ]) do: [ :item |
			| parsed |
			parsed := Tweet new fromNitterHtmlItem: item.
			parsed metadata at: DateAndTime now asString put: pageKey.
			collection add: parsed ] ].
	collection messages: (collection messages select: [ :each | each isNotNil ]).
	collection messages doWithIndex: [ :each :index |
		| here next |
		here := collection messages at: index.
		"Messages at or beyond lastIndex get no link, since no following message is read for them."
		index < collection lastIndex ifTrue: [
			next := collection messages at: index + 1.
			here timelines at: self userName put: next id ] ].
	^ collection
|
||||||
|
|
@ -1,33 +1,5 @@
|
|||||||
accessing
|
accessing
|
||||||
collectRawTweetsUpToPage: anInteger
|
collectRawTweetsUpToPage: anInteger
|
||||||
|
|
||||||
| pagesDict response customQuery |
|
^ self collectRawTweetsFrom: self userNameLinkWithReplies upToPage: anInteger
|
||||||
pagesDict := self getPagesContentsUpto: anInteger.
|
|
||||||
response := TweetsCollection new.
|
|
||||||
customQuery := Dictionary new
|
|
||||||
at: 'parameters' put: pagesDict keys;
|
|
||||||
at: 'date' put: DateAndTime now;
|
|
||||||
yourself.
|
|
||||||
response query: customQuery.
|
|
||||||
pagesDict keysAndValuesDo: [ :key :rawTweets | | temp |
|
|
||||||
temp := (rawTweets xpath: '//div[@class="timeline-item "]') asOrderedCollection
|
|
||||||
collect: [ :xmlElement | xmlElement postCopy ].
|
|
||||||
temp do: [ :tweet | | tempTweet |
|
|
||||||
tempTweet := Tweet new fromNitterHtmlItem: tweet.
|
|
||||||
tempTweet metadata
|
|
||||||
at: DateAndTime now asString put: key;
|
|
||||||
yourself.
|
|
||||||
response add: tempTweet.
|
|
||||||
]
|
|
||||||
].
|
|
||||||
response messages: (response messages select: [ :tweet | tweet isNotNil ]).
|
|
||||||
response messages doWithIndex: [ :tweet :i |
|
|
||||||
| current previous |
|
|
||||||
current := response messages at: i.
|
|
||||||
i < response lastIndex ifTrue: [
|
|
||||||
previous := response messages at: i + 1.
|
|
||||||
current timelines
|
|
||||||
at: self userName put: previous id;
|
|
||||||
yourself ]].
|
|
||||||
^ response.
|
|
||||||
|
|
@ -1,3 +1,3 @@
|
|||||||
operation
|
operation
|
||||||
documentTree
|
documentTree
|
||||||
^ self documentTreeFor: (self userNameLink, '/with_replies')
|
^ self documentTreeFor: self userNameLinkWithReplies
|
@ -1,5 +1,5 @@
|
|||||||
accessing
|
accessing
|
||||||
getPagesContentsFrom: anURL Upto: anInteger
|
getPagesContentsFrom: anURL upTo: anInteger
|
||||||
"I retroactively get all pages contents until a specified page number.
|
"I retroactively get all pages contents until a specified page number.
|
||||||
|
|
||||||
TO DO: should this be split back into two methods, one getting the page urls and the other their contents?
|
TO DO: should this be split back into two methods, one getting the page urls and the other their contents?
|
@ -1,4 +1,4 @@
|
|||||||
accessing
|
accessing
|
||||||
getPagesContentsFromOldestUpto: anInteger
|
getPagesContentsFromOldestUpto: anInteger
|
||||||
|
|
||||||
^ self getPagesContentsFrom: ((self oldestTweet metadata select: [ :item | item isString and: [ item beginsWith: 'https://' ]]) values first) Upto: anInteger
|
^ self getPagesContentsFrom: ((self oldestTweet metadata select: [ :item | item isString and: [ item beginsWith: 'https://' ]]) values first) upTo: anInteger
|
@ -5,4 +5,4 @@ getPagesContentsUpto: anInteger
|
|||||||
TO DO: should this be split back into two methods, one getting the page urls and the other their contents?
|
TO DO: should this be split back into two methods, one getting the page urls and the other their contents?
|
||||||
or will we always be getting both the cursor urls and their contents?
|
or will we always be getting both the cursor urls and their contents?
|
||||||
[ ] Benchmark alternative approaches."
|
[ ] Benchmark alternative approaches."
|
||||||
^ self getPagesContentsFrom: (self userNameLink, '/with_replies') Upto: anInteger
|
^ self getPagesContentsFrom: self userNameLinkWithReplies upTo: anInteger
|
@ -0,0 +1,3 @@
|
|||||||
|
operation
|
||||||
|
userNameLinkWithReplies
	"Answer the receiver's userNameLink with '/with_replies' appended."

	| base |
	base := self userNameLink.
	^ base , '/with_replies'
|
Loading…
Reference in New Issue
Block a user