05-24-2022, 09:50 AM
Stage Two: We now get all the wiki page names from our two files successfully.
Code: (Select All)
$Console:Only
DefLng A-Z
Const HomePage$ = "https://qb64phoenix.com"
ReDim Shared PageNames(10000) As String

' Fetch the "all pages" listing files from the wiki; the result is how many files were saved.
' As of mid 2022, there are only 2 pages listing all the page names.
NumberOfPageLists = DownloadPageLists
'NumberOfPageLists = 2 'hard-coded alternative: skips the downloads while testing the parsing code

' Parse the saved listing files into the shared PageNames() array.
PageCount = CountPages(NumberOfPageLists)
Print PageCount

' Show every collected entry, one at a time.
entry = 1
Do While entry <= PageCount
    Print entry; "of"; PageCount, PageNames(entry)
    Sleep 'pause before showing the next entry
    entry = entry + 1
Loop
Function CountPages (NumberOfPageLists)
    ' Parses the saved listing files "Page List(1).txt" .. "Page List(NumberOfPageLists).txt" and
    ' stores the href target of every <li><a href="..."> entry found between the first "<ul" and the
    ' following "</ul" of each file into the shared PageNames() array (1-based).
    '
    ' Returns the total number of entries stored. On exit PageNames() is resized to exactly that count.
    ' A file that is missing, or that has no "<ul" ... "</ul" section, contributes no entries.
    FileLeft$ = "Page List("
    FileRight$ = ").txt"
    LinkTag$ = "<li><a href="
    For i = 1 To NumberOfPageLists
        file$ = FileLeft$ + _Trim$(Str$(i)) + FileRight$
        If _FileExists(file$) Then 'Open ... For Binary would otherwise create an empty file
            fh = FreeFile
            Open file$ For Binary As #fh
            l = LOF(fh): t$ = Space$(l)
            Get #fh, 1, t$ 'read the whole file into t$ in one go
            Close #fh
            start = InStr(t$, "<ul") 'skip down to the part of the page with the page listings
            If start > 0 Then
                finish = InStr(start, t$, "</ul") 'and we can quit parsing when we get down to this point
                p = start 'current position in file we're parsing
                Do Until p > finish
                    hit = InStr(p, t$, LinkTag$)
                    'Stop when there are no more links, or when the next link lies past the end of the
                    'list; otherwise the first link after "</ul" would be recorded as a page name.
                    If hit = 0 Or hit > finish Then Exit Do
                    p = hit + Len(LinkTag$) + 1 'step over the tag and the opening quote of the href
                    p2 = InStr(p, t$, Chr$(34)) 'closing quote of the href
                    If p2 = 0 Then Exit Do 'unterminated href: nothing usable left in this file
                    count = count + 1
                    'Grow the shared array in chunks instead of overrunning its current upper bound.
                    If count > UBound(PageNames) Then ReDim _Preserve PageNames(count + 1000) As String
                    PageNames(count) = Mid$(t$, p, p2 - p)
                Loop
            End If
        End If
    Next
    CountPages = count
    ReDim _Preserve PageNames(count) As String 'trim the array to the entries actually found
End Function
Function DownloadPageLists
    ' Downloads the wiki's Special:AllPages listing, one file per listing page, into
    ' "Page List(1).txt", "Page List(2).txt", ... and returns the number of files saved.
    '
    ' After each download the link reported by GetNextPage$ is followed, but only while its
    ' "from=" value sorts strictly after the previous one; anything else ends the loop.
    listPrefix$ = "Page List("
    listSuffix$ = ").txt"
    lastFrom$ = ""
    nextUrl$ = "/qb64wiki/index.php/Special:AllPages" 'the first file that we download
    savedCount = 1
    Do
        target$ = listPrefix$ + _Trim$(Str$(savedCount)) + listSuffix$
        Download nextUrl$, target$
        candidate$ = GetNextPage$(target$)
        marker = InStr(candidate$, "from=")
        If marker = 0 Then Exit Do 'no follow-up link with a from= value
        fromKey$ = Mid$(candidate$, marker + 5) 'text after "from="
        If fromKey$ <= lastFrom$ Then Exit Do 'link does not move forward in the listing
        lastFrom$ = fromKey$
        savedCount = savedCount + 1
        nextUrl$ = candidate$
    Loop
    DownloadPageLists = savedCount
End Function
Function CleanHTML$ (OriginalText$)
    ' Returns a copy of OriginalText$ with the escapes listed in the HTML() table decoded:
    '   "&amp;" becomes "&"   and   "%24" becomes "$"
    ' OriginalText$ itself is not modified. Each occurrence is decoded exactly once.
    text$ = OriginalText$ 'don't corrupt incoming text
    Type ReplaceList
        original As String
        replacement As String
    End Type
    'Expandable HTML replacement system
    Dim HTML(1) As ReplaceList
    'The pattern must be the escaped entity. Replacing "&" with "&" never removes the match,
    'so the scan below would never finish on any text containing an ampersand.
    HTML(0).original = "&amp;": HTML(0).replacement = "&"
    HTML(1).original = "%24": HTML(1).replacement = "$"
    For i = 0 To UBound(HTML)
        P = 1
        Do
            P = InStr(P, text$, HTML(i).original)
            If P = 0 Then Exit Do
            text$ = Left$(text$, P - 1) + HTML(i).replacement + Mid$(text$, P + Len(HTML(i).original))
            'Resume after the inserted text so a replacement that contains its own pattern
            'can never be matched again.
            P = P + Len(HTML(i).replacement)
        Loop
    Next
    CleanHTML$ = text$
End Function
Sub Download (url$, outputFile$)
    ' Saves the page at HomePage$ + url$ (after passing url$ through CleanHTML) to outputFile$
    ' by shelling out to the external curl program.
    ' Example of a complete address once cleaned:
    '   https://qb64phoenix.com/qb64wiki/index.php?title=Special:AllPages&from=KEY+n
    q$ = Chr$(34) 'double quote, used to wrap both command-line arguments
    address$ = HomePage$ + CleanHTML(url$)
    Shell "curl -o " + q$ + outputFile$ + q$ + " " + q$ + address$ + q$
End Sub
Function GetNextPage$ (currentPage$)
    ' Reads the saved listing file named by currentPage$ and returns the href of the first link that
    ' follows the <div class="mw-allpages-nav"> marker, passed through CleanHTML.
    ' Returns "" when the div, the link, or the closing quote-plus-title marker cannot be found.
    SpecialPageDivClass$ = "<div class=" + Chr$(34) + "mw-allpages-nav" + Chr$(34) + ">"
    SpecialPageLink$ = "<a href="
    SpecialPageEndLink$ = Chr$(34) + " title"
    fh = FreeFile
    Open currentPage$ For Binary As #fh
    l = LOF(fh)
    t$ = Space$(l)
    Get #fh, 1, t$ 'read the whole file into t$
    Close #fh 'close only our own handle; a bare Close would close every file the program has open
    GetNextPage$ = ""
    sp = InStr(t$, SpecialPageDivClass$)
    If sp = 0 Then Exit Function
    lp = InStr(sp, t$, SpecialPageLink$)
    If lp = 0 Then Exit Function
    lp = lp + Len(SpecialPageLink$) + 1 'step over the tag text and the opening quote of the href
    lp2 = InStr(lp, t$, SpecialPageEndLink$)
    If lp2 = 0 Then Exit Function
    link$ = Mid$(t$, lp, lp2 - lp)
    GetNextPage$ = CleanHTML(link$)
End Function