Wiki Downloader
#4
An upgraded version 2.0 of this. It now has the option to download readable HTML pages, like you got with the first version of this code, or you can toggle the flag and get basically the same text that is used for the IDE help system and stored in internal/help.

Code: (Select All)
$Console:Only
DefLng A-Z
' Wiki Downloader 2.0
' Fetches the wiki's page index, then downloads every listed page either as a
' browser-viewable HTML file or as the text taken from the page's edit form.
Const DownloadMethod = 2 '1 = HTML download, 2 = IDE text download
Const HomePage$ = "https://qb64phoenix.com"
ReDim Shared PageNames(10000) As String 'link targets; filled and trimmed by CountPages
NumberOfPageLists = DownloadPageLists 'As of mid 2022, there are only 2 pages listing all the page names.
'NumberOfPageLists = 2 'hard coded for counting without having to download pages repeatedly while testing code

PageCount = CountPages(NumberOfPageLists)
Print PageCount
t# = Timer: t$ = Time$
For i = 1 To PageCount
    Cls
    Print "Downloading... (Started at: "; t$; ")"
    'The page title is whatever follows the last "/" of the stored link.
    FileName$ = Mid$(PageNames(i), _InStrRev(PageNames(i), "/") + 1)
    FileName2$ = CleanHTML(FileName$) 'decoded title, used as the local file name
    Select Case FileName2$ 'titles that cannot be used as file names get a spelled-out name
        Case "*": FileName2$ = "Multiply"
        Case "/": FileName2$ = "Divide"
    End Select
    Print i; "of"; PageCount, FileName2$: _Display
    If DownloadMethod = 2 Then 'download as IDE text file
        url$ = "/qb64wiki/index.php?title=" + FileName$ + "&action=edit"
        Download url$, FileName2$ + ".HTML"
        HTMLtoText FileName2$ + ".HTML", FileName2$ + ".txt"
        'Kill FileName2$ + ".HTML"  'I'll leave this for now, for comparison and reference.  Remember to unremark it later.
    Else 'download as HTML file for browser viewing
        Download PageNames(i), FileName2$ + ".HTML"
    End If
Next
_AutoDisplay
Print "FINISHED!!  (Finished at: "; Time$; ")" + Chr$(13)
elapsed# = Timer - t#
'TIMER counts seconds since midnight, so a run that crosses midnight would
'otherwise report a negative duration.
If elapsed# < 0 Then elapsed# = elapsed# + 86400
Print Using "##,###.## seconds to download everything on this PC."; elapsed#

Function CountPages (NumberOfPageLists)
    ' Reads the saved index files "Page List(1).txt" .. "Page List(N).txt", stores the
    ' target of every <li><a href="..."> link in the shared PageNames array, and
    ' returns how many links were stored.  PageNames is resized to that count on exit.
    '
    ' NumberOfPageLists: how many index files to read.
    FileLeft$ = "Page List("
    FileRight$ = ").txt"
    marker$ = "<li><a href=" 'the link target starts one character (the opening quote) after this
    For i = 1 To NumberOfPageLists
        file$ = FileLeft$ + _Trim$(Str$(i)) + FileRight$
        handle = FreeFile 'don't assume file number 1 is available
        Open file$ For Binary As #handle
        t$ = Space$(LOF(handle)): Get #handle, 1, t$
        Close #handle
        p = 1 'each file is scanned from its first character, not from where the previous scan stopped
        Do
            found = InStr(p, t$, marker$)
            If found = 0 Then Exit Do 'we've parsed all the lists from the page.  No need to keep going
            p = found + Len(marker$) + 1
            p2 = InStr(p, t$, Chr$(34))
            If p2 = 0 Then Exit Do 'no closing quote (e.g. truncated file): nothing usable remains
            count = count + 1
            'Grow the shared array instead of running past its upper bound.
            If count > UBound(PageNames) Then ReDim _Preserve PageNames(count + 1000) As String
            PageNames(count) = Mid$(t$, p, p2 - p)
        Loop
    Next
    CountPages = count
    ReDim _Preserve PageNames(count) As String
End Function

Function DownloadPageLists
    ' Saves the Special:AllPages index as "Page List(1).txt", "Page List(2).txt", ...
    ' following the link reported by GetNextPage$ for as long as its "from=" value keeps
    ' increasing.  Returns the number of index files written.
    '
    ' NOTE(review): GetNextPage$ returns the first link of the navigation block.  If a
    ' later index page lists a "previous" link first, the ordering check below ends the
    ' loop there - confirm behaviour once the wiki needs 3 or more index pages.
    listsSaved = 0
    lastStart$ = ""
    nextUrl$ = "/qb64wiki/index.php/Special:AllPages" 'the first file that we download
    Do
        listsSaved = listsSaved + 1
        listFile$ = "Page List(" + _Trim$(Str$(listsSaved)) + ").txt"
        Download nextUrl$, listFile$
        nextUrl$ = GetNextPage$(listFile$)
        marker = InStr(nextUrl$, "from=")
        If marker = 0 Then Exit Do 'no continuation link
        startAt$ = Mid$(nextUrl$, marker + 5)
        If startAt$ <= lastStart$ Then Exit Do 'link does not move forward through the index
        lastStart$ = startAt$
    Loop
    DownloadPageLists = listsSaved
End Function

Function CleanHTML$ (OriginalText$)
    ' Returns a copy of OriginalText$ with a fixed list of substitutions applied:
    ' HTML entities, a few wiki-markup clean-ups, and %XX escapes for codes 17-255
    ' (upper-case hex).  The rules run in table order; each rule is repeated, always
    ' searching from the start of the text, until it no longer matches.
    ' The caller's string is left untouched.
    Dim Needle(255) As String, Substitute(255) As String 'Expandable HTML replacement system
    lf$ = Chr$(10)

    Needle(0) = "&amp;": Substitute(0) = "&"
    Needle(1) = "&lt;": Substitute(1) = "<"
    Needle(2) = "&gt;": Substitute(2) = ">"
    Needle(3) = "&verbar;": Substitute(3) = "|"
    Needle(4) = "&pi;": Substitute(4) = Chr$(227)
    Needle(5) = "&theta;": Substitute(5) = Chr$(233)
    Needle(6) = "&sup1;": Substitute(6) = Chr$(252)
    Needle(7) = "&quot;": Substitute(7) = Chr$(34)
    Needle(8) = "&sup2;": Substitute(8) = Chr$(253)
    Needle(9) = "&nbsp;": Substitute(9) = Chr$(255)
    Needle(10) = "Start}}'' ''": Substitute(10) = "Start}}"
    Needle(11) = "Start}} '' ''": Substitute(11) = "Start}}"
    Needle(12) = "Start}}" + lf$ + "'' ''": Substitute(12) = "Start}}"
    Needle(13) = "'' ''" + lf$ + "{{": Substitute(13) = lf$ + "{{"
    Needle(14) = "'' '' " + lf$ + "{{": Substitute(14) = lf$ + "{{"
    Needle(15) = "'' ''" + MKI$(&H0A0A) + "{{": Substitute(15) = lf$ + "{{" 'MKI$(&H0A0A) is two line feeds
    Needle(16) = "#REDIRECT": Substitute(16) = "See page"
    'Remaining slots: percent escapes.  Hex$ yields two digits for every code from 17 up.
    For code = 17 To 255
        Needle(code) = "%" + Hex$(code)
        Substitute(code) = Chr$(code)
    Next

    result$ = OriginalText$ 'don't corrupt incoming text
    For rule = 0 To UBound(Needle)
        hit = InStr(result$, Needle(rule))
        Do While hit > 0
            result$ = Left$(result$, hit - 1) + Substitute(rule) + Mid$(result$, hit + Len(Needle(rule)))
            hit = InStr(result$, Needle(rule))
        Loop
    Next
    CleanHTML$ = result$
End Function

Sub Download (url$, outputFile$)
    ' Runs curl (hidden shell) to save HomePage$ + the cleaned url$ into outputFile$.
    ' url$ is passed through CleanHTML before it is appended to HomePage$.
    q$ = Chr$(34)
    saveAs$ = q$ + outputFile$ + q$
    target$ = q$ + HomePage$ + CleanHTML(url$) + q$
    Shell _Hide "curl -o " + saveAs$ + " " + target$
End Sub

Function GetNextPage$ (currentPage$)
    ' Reads the saved index page currentPage$ and returns the target of the first
    ' <a href="..."> link inside its <div class="mw-allpages-nav"> block, passed through
    ' CleanHTML.  Returns "" when the block, the link, or the link's terminating
    ' quote-plus-" title" sequence cannot be found.
    SpecialPageDivClass$ = "<div class=" + Chr$(34) + "mw-allpages-nav" + Chr$(34) + ">"
    SpecialPageLink$ = "<a href="
    SpecialPageEndLink$ = Chr$(34) + " title"

    handle = FreeFile 'don't assume file number 1 is available
    Open currentPage$ For Binary As #handle
    l = LOF(handle)
    t$ = Space$(l)
    If l > 0 Then Get #handle, 1, t$
    Close #handle 'close only our own file, not every open file

    sp = InStr(t$, SpecialPageDivClass$)
    If sp = 0 Then Exit Function
    lp = InStr(sp, t$, SpecialPageLink$)
    If lp = 0 Then Exit Function
    lp = lp + 9 'skip <a href= and the opening quote
    lp2 = InStr(lp, t$, SpecialPageEndLink$)
    If lp2 = 0 Then Exit Function 'unterminated link: avoid Mid$ with a negative length
    GetNextPage$ = CleanHTML(Mid$(t$, lp, lp2 - lp))
End Function

Sub HTMLtoText (inFile$, outFile$)
    ' Extracts the contents of the first <textarea ...> ... </textarea> region of inFile$
    ' and writes it to outFile$ as text: tags are dropped, <script> sections are skipped,
    ' a line break is written for <br, </p, </l and </d tags, and the remaining text is
    ' passed through CleanHTML.
    ' If inFile$ has no <textarea, outFile$ is created empty.  If the closing tag is
    ' missing, everything after the opening tag is used.
    inHandle = FreeFile 'don't assume file number 1 is available
    Open inFile$ For Binary As #inHandle
    t$ = Space$(LOF(inHandle))
    If Len(t$) > 0 Then Get #inHandle, 1, t$
    Close #inHandle

    outHandle = FreeFile
    Open outFile$ For Output As #outHandle

    openAt = InStr(t$, "<textarea")
    If openAt = 0 Then
        'No edit box in the download (failed fetch, protected page, ...): nothing to convert.
        Close #outHandle
        Exit Sub
    End If
    t$ = Mid$(t$, openAt)
    closeAt = InStr(t$, "</textarea>")
    If closeAt > 0 Then t$ = Left$(t$, closeAt - 1) 'guard: Left$ must not get a negative length

    Do
        If Left$(t$, 8) = "<script>" Then
            i = InStr(t$, "</script>")
            t$ = Mid$(t$, i + 9)
        Else
            Select Case Left$(t$, 1)
                Case " ", Chr$(10), Chr$(13) 'ignore leading spaces
                    t$ = Mid$(t$, 2)
                Case "<" 'look for a leading <
                    i = InStr(t$, ">")
                    If i = 0 Then 'unterminated tag: emit what is left and stop
                        Print #outHandle, CleanHTML(t$)
                        Exit Do
                    End If
                    Select Case Left$(t$, 3)
                        Case "<br", "</p", "</l", "</d": Print #outHandle, ""
                    End Select
                    t$ = Mid$(t$, i + 1) 'skip stuff in html formatting brackets
                Case Else
                    i = InStr(t$, "<")
                    If i Then
                        'Text up to the next tag; the trailing ; keeps it on the current line.
                        Print #outHandle, CleanHTML(Left$(t$, i - 1));
                        t$ = Mid$(t$, i)
                    Else
                        Print #outHandle, CleanHTML(t$)
                        Exit Do
                    End If
            End Select
        End If
    Loop
    Close #outHandle 'close only our own file, not every open file
End Sub

I may have overlooked a few small tweaks needed to make this *perfectly* match the IDE format, but I don't think those will be very hard to sort out and adjust for, with the source as it stands now. :)

Below is an image of what an IDE-formatted help page looks like, for those interested:

[Image: image.png]
Reply


Messages In This Thread
Wiki Downloader - by SMcNeill - 05-24-2022, 11:41 AM
RE: Wiki Downloader - by Statsman1 - 05-24-2022, 01:30 PM
RE: Wiki Downloader - by Coolman - 05-24-2022, 04:21 PM
RE: Wiki Downloader - by SMcNeill - 05-25-2022, 12:09 AM
RE: Wiki Downloader - by zaadstra - 05-27-2022, 06:28 PM
RE: Wiki Downloader - by SMcNeill - 05-31-2022, 02:42 AM
RE: Wiki Downloader - by SMcNeill - 05-31-2022, 04:24 PM



Users browsing this thread: 2 Guest(s)