05-25-2022, 12:09 AM
An upgraded version 2.0 of this -- it now has the option to download readable HTML pages, like what you got with the first version of this code, or you can toggle the flag and get basically the same text that is used for the IDE help system and stored in internal/help.
I may have overlooked a few small tweaks so that this will *perfectly* match the IDE format, but I don't think those will be very hard to sort out and adjust for, with the source as it stands now.
Below is basically an image of what an IDE-formatted help page looks like, for those interested:
Code: (Select All)
$Console:Only
DefLng A-Z

' Wiki downloader.
' 1. Downloads the wiki's "all pages" index lists (DownloadPageLists).
' 2. Extracts every page link from those lists into PageNames() (CountPages).
' 3. Downloads each page, either as plain HTML or as the edit-form source
'    converted to a text file, depending on DownloadMethod.

Const DownloadMethod = 2 '1 = HTML download, 2 = IDE text download
Const HomePage$ = "https://qb64phoenix.com"

ReDim Shared PageNames(10000) As String 'filled and trimmed to size by CountPages

NumberOfPageLists = DownloadPageLists 'As of mid 2022, there are only 2 pages listing all the page names.
'NumberOfPageLists = 2 'hard coded for counting without having to download pages repeatedly while testing code
PageCount = CountPages(NumberOfPageLists)
Print PageCount

t# = Timer: t$ = Time$
For i = 1 To PageCount
    Cls
    Print "Downloading... (Started at: "; t$; ")"

    'The page name is whatever follows the last "/" of the link.
    FileName$ = Mid$(PageNames(i), _InStrRev(PageNames(i), "/") + 1)
    FileName2$ = CleanHTML(FileName$)

    'Page names that cannot be used as file names get a spelled-out name.
    Select Case FileName2$
        Case "*": FileName2$ = "Multiply"
        Case "/": FileName2$ = "Divide"
    End Select
    Print i; "of"; PageCount, FileName2$: _Display

    If DownloadMethod = 2 Then 'download as IDE text file
        'The edit form of a page carries the raw wiki source inside a <textarea>.
        url$ = "/qb64wiki/index.php?title=" + FileName$ + "&action=edit"
        Download url$, FileName2$ + ".HTML"
        HTMLtoText FileName2$ + ".HTML", FileName2$ + ".txt"
        'Kill FileName2$ + ".HTML" 'I'll leave this for now, for comparison and reference. Remember to unremark it later.
    Else 'download as HTML file for browser viewing
        Download PageNames(i), FileName2$ + ".HTML"
    End If
Next
_AutoDisplay

'Timer restarts from 0 at midnight, so a run that crosses midnight would
'otherwise report a negative duration.
elapsed# = Timer - t#
If elapsed# < 0 Then elapsed# = elapsed# + 86400

Print "FINISHED!! (Finished at: "; Time$; ")" + Chr$(13)
Print Using "##,###.## seconds to download everything on this PC."; elapsed#
' CountPages
' Reads the previously downloaded index files "Page List(1).txt" ..
' "Page List(NumberOfPageLists).txt", stores the target of every
' <li><a href="..."> link in the shared PageNames() array (1-based), trims
' the array to the number of links found, and returns that number.
Function CountPages (NumberOfPageLists)
    FileLeft$ = "Page List("
    FileRight$ = ").txt"
    marker$ = "<li><a href="

    For i = 1 To NumberOfPageLists
        file$ = FileLeft$ + _Trim$(Str$(i)) + FileRight$
        Open file$ For Binary As #1
        t$ = Space$(LOF(1)): Get #1, 1, t$
        Close #1

        p = 1 'every list file is scanned from its own beginning
        Do
            hit = InStr(p, t$, marker$)
            If hit = 0 Then Exit Do 'we've parsed all the lists from the page. No need to keep going

            p = hit + Len(marker$) + 1 'step over the marker and the opening quote
            p2 = InStr(p, t$, Chr$(34)) 'closing quote of the href value
            If p2 = 0 Then Exit Do 'truncated file: no closing quote, nothing usable left

            count = count + 1
            'Grow the shared array rather than overrun it if the wiki has more pages than expected.
            If count > UBound(PageNames) Then ReDim _Preserve PageNames(UBound(PageNames) + 10000) As String
            PageNames(count) = Mid$(t$, p, p2 - p)
        Loop
    Next

    CountPages = count
    ReDim _Preserve PageNames(count) As String
End Function
' DownloadPageLists
' Saves the wiki's "all pages" index as "Page List(1).txt", "Page List(2).txt", ...
' following the navigation link returned by GetNextPage$ for as long as its
' "from=" value keeps increasing. Returns how many list files were saved.
Function DownloadPageLists
    listIndex = 1
    lastStart$ = ""
    nextUrl$ = "/qb64wiki/index.php/Special:AllPages" 'the first file that we download

    Do
        listFile$ = "Page List(" + _Trim$(Str$(listIndex)) + ").txt"
        Download nextUrl$, listFile$

        candidate$ = GetNextPage$(listFile$)
        marker = InStr(candidate$, "from=")
        If marker = 0 Then Exit Do 'no follow-up link on this list page

        'Everything after "from=" is the starting point of the linked list page.
        startKey$ = Mid$(candidate$, marker + 5)
        If startKey$ <= lastStart$ Then Exit Do 'link does not move forward: stop

        lastStart$ = startKey$
        listIndex = listIndex + 1
        nextUrl$ = candidate$
    Loop

    DownloadPageLists = listIndex
End Function
' CleanHTML$
' Returns a copy of OriginalText$ in which
'   - the listed HTML entities are replaced by their characters,
'   - a handful of wiki-markup spacing patterns around "Start}}" and "{{" are collapsed,
'   - "#REDIRECT" becomes "See page",
'   - "%XX" escapes (upper-case hex, codes 17..255) become Chr$(XX).
' The argument itself is not modified.
Function CleanHTML$ (OriginalText$)
    text$ = OriginalText$ 'don't corrupt incoming text

    Type ReplaceList
        original As String
        replacement As String
    End Type
    Dim HTML(255) As ReplaceList 'Expandable HTML replacement system

    'Entity names must be spelled out in full here: an entry whose search text
    'equals its replacement (e.g. "&" -> "&") would make the loop below spin forever.
    HTML(0).original = "&amp;": HTML(0).replacement = "&"
    HTML(1).original = "&lt;": HTML(1).replacement = "<"
    HTML(2).original = "&gt;": HTML(2).replacement = ">"
    HTML(3).original = "&verbar;": HTML(3).replacement = "|"
    HTML(4).original = "&pi;": HTML(4).replacement = Chr$(227)
    HTML(5).original = "&theta;": HTML(5).replacement = Chr$(233)
    HTML(6).original = "&sup1;": HTML(6).replacement = Chr$(252)
    HTML(7).original = "&quot;": HTML(7).replacement = Chr$(34)
    HTML(8).original = "&sup2;": HTML(8).replacement = Chr$(253)
    HTML(9).original = "&nbsp;": HTML(9).replacement = Chr$(255)

    'Wiki-markup clean-ups: drop the empty '' '' pairs next to template markers.
    HTML(10).original = "Start}}'' ''": HTML(10).replacement = "Start}}"
    HTML(11).original = "Start}} '' ''": HTML(11).replacement = "Start}}"
    HTML(12).original = "Start}}" + Chr$(10) + "'' ''": HTML(12).replacement = "Start}}"
    HTML(13).original = "'' ''" + Chr$(10) + "{{": HTML(13).replacement = Chr$(10) + "{{"
    HTML(14).original = "'' '' " + Chr$(10) + "{{": HTML(14).replacement = Chr$(10) + "{{"
    HTML(15).original = "'' ''" + MKI$(&H0A0A) + "{{": HTML(15).replacement = Chr$(10) + "{{" 'MKI$(&H0A0A) = two line feeds
    HTML(16).original = "#REDIRECT": HTML(16).replacement = "See page"

    'Percent escapes. Hex$ yields upper-case digits, so only "%2F"-style
    'escapes are matched, not "%2f".
    For i = 17 To 255
        HTML(i).original = "%" + Hex$(i)
        HTML(i).replacement = Chr$(i)
    Next

    'Apply the entries in table order; each one is replaced until no occurrence is left.
    For i = 0 To UBound(HTML)
        Do
            P = InStr(text$, HTML(i).original)
            If P = 0 Then Exit Do
            text$ = Left$(text$, P - 1) + HTML(i).replacement + Mid$(text$, P + Len(HTML(i).original))
        Loop
    Next

    CleanHTML$ = text$
End Function
' Download
' Fetches HomePage$ + url$ (after url$ has been passed through CleanHTML)
' into outputFile$ by running curl in a hidden shell.
Sub Download (url$, outputFile$)
    q$ = Chr$(34) 'double quote, used to wrap both command-line arguments
    target$ = HomePage$ + CleanHTML(url$)
    cmd$ = "curl -o " + q$ + outputFile$ + q$ + " " + q$ + target$ + q$
    Shell _Hide cmd$
End Sub
' GetNextPage$
' Reads the saved list page currentPage$, finds the first <a href="..."> that
' follows the <div class="mw-allpages-nav"> marker and returns its target
' (passed through CleanHTML). Returns "" when the div, the link, or the end of
' the link cannot be found.
' NOTE(review): only the first link after the nav div is examined; if that div
' can list a "previous" link ahead of the "next" link, confirm the caller's
' ordering check still does what is intended.
Function GetNextPage$ (currentPage$)
    SpecialPageDivClass$ = "<div class=" + Chr$(34) + "mw-allpages-nav" + Chr$(34) + ">"
    SpecialPageLink$ = "<a href="
    SpecialPageEndLink$ = Chr$(34) + " title"

    Open currentPage$ For Binary As #1
    t$ = Space$(LOF(1))
    Get #1, 1, t$
    Close #1 'close only the file opened here, not every open file

    sp = InStr(t$, SpecialPageDivClass$)
    If sp = 0 Then Exit Function

    lp = InStr(sp, t$, SpecialPageLink$)
    If lp = 0 Then Exit Function
    lp = lp + Len(SpecialPageLink$) + 1 'step over <a href= and the opening quote

    lp2 = InStr(lp, t$, SpecialPageEndLink$)
    If lp2 = 0 Then Exit Function 'no terminator: avoid Mid$ with a negative length

    GetNextPage$ = CleanHTML(Mid$(t$, lp, lp2 - lp))
End Function
' HTMLtoText
' Reads inFile$, keeps only the part from "<textarea" up to (not including)
' "</textarea>", removes tags from it and writes the remaining text, passed
' through CleanHTML, to outFile$.
' NOTE(review): neither marker is checked for "not found" (InStr = 0) before
' it is used in Mid$ / Left$ - confirm how a page without a textarea behaves.
' NOTE(review): the bare Close statements close every open file, not just #1.
Sub HTMLtoText (inFile$, outFile$)
Open inFile$ For Binary As #1
t$ = Space$(LOF(1)): Get #1, 1, t$
Close
'Cut the text down to the textarea element (its opening tag is still at the front).
start$ = "<textarea": t$ = Mid$(t$, InStr(t$, start$))
finish$ = "</textarea>": t$ = Left$(t$, InStr(t$, finish$) - 1)
Open outFile$ For Output As #1
'Each pass consumes something from the front of t$ until nothing is left.
Do
a$ = Left$(t$, 8)
If a$ = "<script>" Then
'Drop the script element together with its content.
i = InStr(t$, "</script>")
t$ = Mid$(t$, i + 9)
Else
a$ = Left$(t$, 1)
Select Case a$
Case " ", Chr$(10), Chr$(13): t$ = Mid$(t$, 2) 'ignore leading spaces
Case "<": 'look for a leading <
i = InStr(t$, ">")
'An unterminated tag: write out what is left and stop.
If i = 0 Then Print #1, CleanHTML(t$): Exit Do
skip$ = Left$(t$, 3)
'Tags starting with these three characters produce a line break in the output.
Select Case skip$
Case "<br", "</p", "</l", "</d": Print #1, ""
End Select
t$ = Mid$(t$, i + 1) 'skip stuff in html formatting brackets
Case Else
'Plain text: write everything up to the next tag; the trailing ";" keeps
'the output on the same line.
i = InStr(t$, "<")
If i Then
Print #1, CleanHTML(Left$(t$, i - 1));
t$ = Mid$(t$, i)
Else
'No more tags: the rest (possibly empty) is the final piece of text.
Print #1, CleanHTML(t$)
Exit Do
End If
End Select
End If
Loop
Close
End Sub
I may have overlooked a few small tweaks so that this will *perfectly* match the IDE format, but I don't think those will be very hard to sort out and adjust for, with the source as it stands now.
Below is basically an image of what an IDE-formatted help page looks like, for those interested: