Wiki Downloader
#7
And, after a little additional fiddling, I'm now generating pages that match the QB64 IDE's help-page formatting 100%.

Code:
$Console:Only
DefLng A-Z

Type ODL_Type
    site As String
    handle As Long
    contents As String
    baseURL As String
    path As String
End Type


Dim Shared CRLF As String: CRLF$ = Chr$(13) + Chr$(10)

Const DownloadMethod = 2 '1 = HTML download, 2 = IDE text download
Const HomePage$ = "http://qb64phoenix.com"
ReDim Shared PageNames(10000) As String
NumberOfPageLists = DownloadPageLists 'As of mid 2022, there are only 2 pages listing all the page names.
'NumberOfPageLists = 2 'hard coded for counting without having to download pages repeatedly while testing code

PageCount = CountPages(NumberOfPageLists)
Dim Shared MaxDownLoads As Long: MaxDownLoads = 20
Dim Shared ODL(MaxDownLoads) As ODL_Type

t# = Timer: t1$ = Time$

For j = 0 To PageCount Step MaxDownLoads
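    'Open all the connections for this batch of MaxDownLoads pages at once; the Do loop below then polls each one until every download in the batch has finished.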
    For i = 1 To MaxDownLoads 'OPEN ALL CONNECTIONS
        If j + i > UBound(PageNames) Then _Continue
        FileName$ = Mid$(PageNames(i + j), _InStrRev(PageNames(i + j), "/") + 1)
        url$ = "http://qb64phoenix.com/qb64wiki/" + FileName$
        If DownloadMethod = 2 Then url$ = "http://qb64phoenix.com/qb64wiki/index.php?title=" + FileName$ + "&action=edit"
        validHandle = OpenDownLoad(url$)
        'Print ODL(i).handle,
    Next
    Cls
    Print "Downloading... (Started at: "; t1$; ")"
    Print "Curently on:"; j; " of "; PageCount
    Do
        finished = -1
        For i = 1 To MaxDownLoads 'DOWNLOAD ALL FILES
            If j + i > UBound(PageNames) Then _Continue
            If ODL(i).handle <> 0 Then
                t$ = GetDownloads(i)
                finished = 0
                If t$ <> "" Then
                    FileName$ = Mid$(PageNames(i + j), _InStrRev(PageNames(i + j), "/") + 1)
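                    'Build a filesystem-safe version of the page name: spaces become underscores, reserved characters become %HH escapes.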
                    PageName2$ = ""
                    For i1 = 1 To Len(FileName$)
                        c = Asc(FileName$, i1)
                        Select Case c
                            Case 32 '                                    '(space)
                                PageName2$ = PageName2$ + "_"
                            Case 34, 38, 42, 47, 58, 60, 62, 63, 92, 124 '("&*/:<>?\|)
                                PageName2$ = PageName2$ + "%" + Hex$(c)
                            Case Else
                                PageName2$ = PageName2$ + Chr$(c)
                        End Select
                    Next

                    f = FreeFile
                    Open PageName2$ + ".HTML" For Output As #f
                    Print #f, t$
                    Close f

                    If DownloadMethod = 2 Then
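                        'Append a 0/1 mask of each letter's case (upper = 1, lower = 0) so page names that differ only by case still end up in distinct files.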
                        ext$ = Space$(Len(PageName2$))
                        For i1 = 1 To Len(PageName2$)
                            c = Asc(PageName2$, i1)
                            Select Case c
                                Case 65 To 90: Asc(ext$, i1) = 49 'upper = 1
                                Case 97 To 122: Asc(ext$, i1) = 48 'lower = 0
                                Case Else: Asc(ext$, i1) = c 'non-letter = take as is
                            End Select
                        Next
                        wikiSafeName$ = PageName2$ + "_" + ext$
                        HTMLtoText PageName2$ + ".HTML", wikiSafeName$ + ".txt"
                    End If
                End If
            End If
        Next
    Loop Until finished
Next
Print "FINISHED!!  (Finsihed at: "; Time$; ")" + Chr$(13)
Print Using "##,###.## seconds to download everything on this PC."; Timer - t#

Function CountPages (NumberOfPageLists)
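    'Scan the saved page-list files for <li><a href= entries, collecting every page link into PageNames() and returning the count.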
    FileLeft$ = "Page List("
    FileRight$ = ").txt"
    For i = 1 To NumberOfPageLists
        file$ = FileLeft$ + _Trim$(Str$(i)) + FileRight$
        Open file$ For Binary As #1
        t$ = Space$(LOF(1)): Get #1, 1, t$
        Close #1
        Do
            p = InStr(p, t$, "<li><a href=") + 13
            If p = 13 Then Exit Do 'we've parsed all the lists from the page.  No need to keep going
            p2 = InStr(p, t$, Chr$(34))
            count = count + 1
            PageNames(count) = Mid$(t$, p, p2 - p)
        Loop
    Next
    CountPages = count
    ReDim _Preserve PageNames(count) As String
End Function

Function DownloadPageLists
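    'Fetch Special:AllPages via curl and keep following its 'from=' next-page link, saving each listing as Page List(n).txt, until no further page exists.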
    FileLeft$ = "Page List("
    FileRight$ = ").txt"
    FileCount = 1
    CurrentFile$ = ""
    url$ = "/qb64wiki/index.php/Special:AllPages" 'the first file that we download
    Do
        file$ = FileLeft$ + _Trim$(Str$(FileCount)) + FileRight$
        Download url$, file$
        url2$ = GetNextPage$(file$)
        P = InStr(url2$, "from=")
        If P = 0 Then Exit Do
        If Mid$(url2$, P + 5) > CurrentFile$ Then
            CurrentFile$ = Mid$(url2$, P + 5)
            FileCount = FileCount + 1
            url$ = url2$
        Else
            Exit Do
        End If
    Loop
    DownloadPageLists = FileCount
    _Delay .25 'give the pages time to write to the drive before attempting to read them
End Function

Function CleanHTML$ (OriginalText$)
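    'Replace HTML entities and a few wiki-source artifacts with single-character equivalents, and decode %HH escapes back into the characters they stand for.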
    text$ = OriginalText$ 'don't corrupt incoming text
    Type ReplaceList
        original As String
        replacement As String
    End Type
    Dim HTML(255) As ReplaceList 'Expandable HTML replacement system
    HTML(0).original = "&amp;": HTML(0).replacement = "&"
    HTML(1).original = "&lt;": HTML(1).replacement = "<"
    HTML(2).original = "&gt;": HTML(2).replacement = ">"
    HTML(3).original = "&verbar;": HTML(3).replacement = "|"
    HTML(4).original = "&pi;": HTML(4).replacement = Chr$(227)
    HTML(5).original = "&theta;": HTML(5).replacement = Chr$(233)
    HTML(6).original = "&sup1;": HTML(6).replacement = Chr$(252)
    HTML(7).original = "&quot;": HTML(7).replacement = Chr$(34)
    HTML(8).original = "&sup2;": HTML(8).replacement = Chr$(253)
    HTML(9).original = "&nbsp;": HTML(9).replacement = Chr$(255)
    HTML(10).original = "Start}}'' ''": HTML(10).replacement = "Start}}"
    HTML(11).original = "Start}} '' ''": HTML(11).replacement = "Start}}"
    HTML(12).original = "Start}}" + Chr$(10) + "'' ''": HTML(12).replacement = "Start}}"
    HTML(13).original = "'' ''" + Chr$(10) + "{{": HTML(13).replacement = Chr$(10) + "{{"
    HTML(14).original = "'' '' " + Chr$(10) + "{{": HTML(14).replacement = Chr$(10) + "{{"
    HTML(15).original = "'' ''" + MKI$(&H0A0A) + "{{": HTML(15).replacement = Chr$(10) + "{{"
    HTML(16).original = "#REDIRECT": HTML(16).replacement = "See page"
    For i = 17 To 255
        HTML(i).original = "%" + Hex$(i)
        HTML(i).replacement = Chr$(i)
    Next
    For i = 0 To UBound(HTML)
        Do
            P = InStr(text$, HTML(i).original)
            If P = 0 Then Exit Do
            text$ = Left$(text$, P - 1) + HTML(i).replacement + Mid$(text$, P + Len(HTML(i).original))
        Loop
    Next
    CleanHTML$ = text$
End Function

Sub Download (url$, outputFile$)
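    'Shell out to curl to save HomePage$ + url$ into outputFile$ (used for grabbing the page-list files).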
    url2$ = CleanHTML(url$)
    Shell _Hide "curl -o " + Chr$(34) + outputFile$ + Chr$(34) + " " + Chr$(34) + HomePage$ + url2$ + Chr$(34)
End Sub

Function GetNextPage$ (currentPage$)
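    'Read the saved list page and return the first link found in its mw-allpages-nav block (the page navigation), cleaned of HTML entities.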
    SpecialPageDivClass$ = "<div class=" + Chr$(34) + "mw-allpages-nav" + Chr$(34) + ">"
    SpecialPageLink$ = "<a href="
    SpecialPageEndLink$ = Chr$(34) + " title"
    Open currentPage$ For Binary As #1
    l = LOF(1)
    t$ = Space$(l)
    Get #1, 1, t$
    Close #1
    sp = InStr(t$, SpecialPageDivClass$)
    If sp Then
        lp = InStr(sp, t$, SpecialPageLink$)
        If lp Then
            lp = lp + 9
            lp2 = InStr(lp, t$, SpecialPageEndLink$)
            link$ = Mid$(t$, lp, lp2 - lp)
            GetNextPage$ = CleanHTML(link$)
        End If
    End If
End Function

Sub HTMLtoText (inFile$, outFile$)
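    'Pull the <textarea> contents out of the downloaded page (the raw wiki source on an edit page), strip tags and scripts, and write the cleaned text to outFile$.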
    Open inFile$ For Binary As #1
    t$ = Space$(LOF(1)): Get #1, 1, t$
    Close #1
    start$ = "<textarea": t$ = Mid$(t$, InStr(t$, start$))
    finish$ = "</textarea>": t$ = Left$(t$, InStr(t$, finish$) - 1)
    Open outFile$ For Output As #1
    If DownloadMethod = 2 Then Print #1, "{{QBDLDATE:" + Date$ + "}}" + Chr$(10) + "{{QBDLTIME:" + Time$ + "}}" + Chr$(10)

    Do
        a$ = Left$(t$, 8)
        If a$ = "<script>" Then
            i = InStr(t$, "</script>")
            t$ = Mid$(t$, i + 9)
        Else
            a$ = Left$(t$, 1)
            Select Case a$
                Case " ", Chr$(10), Chr$(13): t$ = Mid$(t$, 2) 'ignore leading spaces
                Case "<": 'look for a leading <
                    i = InStr(t$, ">")
                    If i = 0 Then Print #1, CleanHTML(t$): Exit Do
                    skip$ = Left$(t$, 3)
                    Select Case skip$
                        Case "<br", "</p", "</l", "</d": Print #1, ""
                    End Select
                    t$ = Mid$(t$, i + 1) 'skip stuff in html formatting brackets
                Case Else
                    i = InStr(t$, "<")
                    If i Then
                        Print #1, CleanHTML(Left$(t$, i - 1));
                        t$ = Mid$(t$, i)
                    Else
                        Print #1, CleanHTML(t$)
                        Exit Do
                    End If
            End Select
        End If
    Loop
    Close #1
End Sub



Function OpenDownLoad (site$)
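    'Claim a free slot in ODL(), open a TCP/IP client to the site on port 80, and send the HTTP GET request; GetDownloads collects the reply later.
    'If the site is already being fetched, just return its existing handle.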
    For i = 1 To MaxDownLoads
        If ODL(i).site = site$ Then
            OpenDownLoad = ODL(i).handle
            Exit Function
        End If
    Next
    For i = 1 To MaxDownLoads
        If ODL(i).handle = 0 Then
            ODL(i).site = site$
            If Left$(UCase$(site$), 5) = "HTTPS" Then Exit Function 'can't open HTTPS pages like this
            webpage$ = site$
            If Left$(LCase$(webpage$), 7) = "http://" Then webpage$ = Mid$(webpage$, 8) 'trim http://
            p = InStr(webpage$, "/")
            If p = 0 Then Exit Function
            baseURL$ = Left$(webpage$, p - 1)
            path$ = Mid$(webpage$, p)
            ODL(i).handle = _OpenClient("TCP/IP:80:" + baseURL$)
            ODL(i).contents = ""
            'base is everything before the first /, path is everything else.
            'for example: qb64phoenix.com/qb64wiki/index.php=Main_Page, our base is qb64phoenix.com
            '             and the path would be /qb64wiki/index.php=Main_Page
            Request$ = "GET " + path$ + " HTTP/1.1" + CRLF$ + "Host:" + baseURL$ + CRLF$ + CRLF$
            Put #ODL(i).handle, , Request$
            OpenDownLoad = ODL(i).handle 'return the handle we just opened
            Exit Function
        End If
    Next
    OpenDownLoad = 0
End Function

Function GetDownloads$ (PassedI)
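    'Poll one connection, appending whatever has arrived to its buffer; once </html> shows up, close the handle and return the complete page (otherwise return an empty string).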
    i = PassedI
    'Print i, ODL(i).handle
    If ODL(i).handle <> 0 Then
        Get #ODL(i).handle, , t$
        ODL(i).contents = ODL(i).contents + t$
        If InStr(t$, "</html>") Then
            Close ODL(i).handle
            ODL(i).handle = 0
            GetDownloads = ODL(i).contents
        End If
    End If
End Function


Function Download$ (toSite$)
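    'A blocking, single-connection version of the same GET-over-TCP download; not called by the batch code above.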
    CRLF$ = Chr$(13) + Chr$(10)
    If Left$(UCase$(toSite$), 5) = "HTTPS" Then Exit Function 'can't open HTTPS pages like this
    webpage$ = toSite$
    If Left$(LCase$(webpage$), 7) = "http://" Then webpage$ = Mid$(webpage$, 8) 'trim http://
    p = InStr(webpage$, "/")
    If p = 0 Then Exit Function
    baseURL$ = Left$(webpage$, p - 1)
    path$ = Mid$(webpage$, p)
    OpenHandle = _OpenClient("TCP/IP:80:" + baseURL$)
    'base is everything before the first /, path is everything else.
    'for example: qb64phoenix.com/qb64wiki/index.php=Main_Page, our base is qb64phoenix.com
    '             and the path would be /qb64wiki/index.php=Main_Page
    Request$ = "GET " + path$ + " HTTP/1.1" + CRLF$ + "Host:" + baseURL$ + CRLF$ + CRLF$
    Put #OpenHandle, , Request$
    Do
        Get #OpenHandle, , t$
        tempDownload$ = tempDownload$ + t$
        _Limit 20
    Loop Until InStr(t$, "</html>")
    Close OpenHandle
    Download$ = tempDownload$
End Function



[Image: image.png — WinMerge comparison of an externally generated page against the QB64 IDE's own]

As you can see from the above, I'm using WinMerge to check whether the files are comparable, and the only difference between what I'm producing here externally and what QB64 produces internally is the timestamp. (And I'm producing those pages in about 70 seconds instead of 10 minutes or so.)