By using this site you agree to the use of cookies by Brugbart and our partners.

Learn more

Google Scraper Tool

This AutoIt script can be used to scrape websites. For now, it uses the Google search engine as an example.

Edited: 2016-03-28 12:51

AutoIt Logo

This script automatically scrapes the results from Google and saves them in an HTML file as an ordered list. The script can easily be modified to work with other sites. If you want to scrape another search engine, simply modify the code that builds the URL.

The script has a built-in GUI that lets you select the number of pages to scrape.

Note. This could be considered a blackhat tool. Abusing this tool could get your IP banned on certain websites. To avoid this problem, it may help to change your $sleeptime value, and randomize it to avoid basic detection methods.

To find more information on AutoIt, you may also want to read the AutoIt Tutorial here on Brugbart.

#cs

 AutoIt Version: 3.3.6.1

 Script Function:
    This Script can be used to scrape Google for URLs

    Normal use should not give you any problems, but it should be mentioned that abusing this is likely to get you banned.

 Version: 2.1

#ce

#include <GUIConstantsEx.au3>
#include <WindowsConstants.au3>
#include <StaticConstants.au3>


; ---------------------------------------------------------------------------
; GUI setup: creates the 400x200 main window, its controls and menus, then
; shows it. The control handles assigned here are read later by the event
; loop and by the scraper functions.
; ---------------------------------------------------------------------------
GUICreate("Google Scraper Tool", 400, 200, -1, -1, $WS_SIZEBOX + $WS_SYSMENU + $WS_MINIMIZEBOX + $WS_MAXIMIZEBOX)
GUISetIcon("Scrape.ico")
; GUICoordMode 0 = control coordinates are relative to the previously created
; control (see the AutoIt Opt documentation). This is why some positions below
; are negative, and why the creation order of the controls must not change.
Opt("GUICoordMode", 0)


; Instruction text at the top of the window.
$label = GUICtrlCreateLabel("Type in your search query and hit the Scrape button.", 10, 10, 300, 25)

; Search query input and the button that starts the scrape.
$query = GUICtrlCreateInput("", 10, 35, 300, 20)
$okbutton = GUICtrlCreateButton("Scrape", 305, -3, 60, 25)
; Number of result pages to scrape (initial value "1"), with an up/down spinner attached.
$MaxPinput = GUICtrlCreateInput("1", 10, 60, 50, 20)
$updown = GUICtrlCreateUpdown($MaxPinput)
$label2 = GUICtrlCreateLabel("Pages:", -55, 0, 40, 25, $SS_RIGHT)
; Option checkboxes: verify each scraped URL, and additionally look for the keywords.
$TestURLs = GUICtrlCreateCheckbox("Check URLs", -270, -29, 100, 20)
$ChkUrlKeywords = GUICtrlCreateCheckbox("And Check for Keywords?", 105, 0, 150, 20)



; Resizing behaviour of each control when the window is resized.
GUICtrlSetResizing($label, $GUI_DOCKTOP + $GUI_DOCKVCENTER + $GUI_DOCKHEIGHT)
GUICtrlSetResizing($label2, $GUI_DOCKTOP + $GUI_DOCKHEIGHT)
GUICtrlSetResizing($query, $GUI_DOCKTOP + $GUI_DOCKVCENTER + $GUI_DOCKHEIGHT)
GUICtrlSetResizing($okbutton, $GUI_DOCKTOP + $GUI_DOCKVCENTER + $GUI_DOCKHEIGHT)
GUICtrlSetResizing($MaxPinput, $GUI_DOCKTOP + $GUI_DOCKHEIGHT)
GUICtrlSetResizing($TestURLs, $GUI_DOCKTOP + $GUI_DOCKHEIGHT + $GUI_DOCKVCENTER)
GUICtrlSetResizing($ChkUrlKeywords, $GUI_DOCKTOP + $GUI_DOCKHEIGHT + $GUI_DOCKVCENTER)
; Route WM_GETMINMAXINFO to MY_WM_GETMINMAXINFO (defined in this file), which
; sets the minimum window size.
GUIRegisterMsg($WM_GETMINMAXINFO, "MY_WM_GETMINMAXINFO")

; Top-level menus.
$filemenu = GUICtrlCreateMenu("File")
$helpmenu = GUICtrlCreateMenu("Help")

; Menu items; both are handled in the main event loop.
$exititem = GUICtrlCreateMenuItem("Exit", $filemenu)
$helpitem = GUICtrlCreateMenuItem("About Google Scraper Tool", $helpmenu)

GUISetState(@SW_SHOW)

; Script-wide COM error handler: "AutoIt.Error" events are sent to _ErrFunc,
; whose body is empty, so COM errors are ignored and the script keeps running.
Global $oError = ObjEvent("AutoIt.Error", "_ErrFunc")

; Main event loop.
;   Scrape button   -> scrape the requested number of result pages, report, exit.
;   Help menu item  -> show the About box.
;   Exit / close    -> leave the loop (the script then ends).
While 1
    $msg = GUIGetMsg()

    Switch $msg
        Case $okbutton
            ; The input box returns text; normalise it to an integer and treat
            ; anything below 1 as a single page so the progress text stays sane.
            $MaxPages = Int(GUICtrlRead($MaxPinput))
            If $MaxPages < 1 Then $MaxPages = 1
            ; $string stays a global: CheckURL() reads it for the keyword check.
            $string = GUICtrlRead($query)

            ; An empty URL tells Scrape() to fetch the first result page and to
            ; start a fresh Results.html.
            $url = ""
            Scrape($string, $url)

            ToolTip("Completed first scrape.", 0, 0)
            $sleeptime = 2000
            Sleep($sleeptime)

            ; Remaining pages 2..$MaxPages. The URL built here uses a "start"
            ; offset that grows by 10 per page (10 for page 2).
            For $i = 2 To $MaxPages
                $StartNumber = ($i - 1) * 10
                $url = "http://www.google.com/search?q=" & $string & "&start=" & $StartNumber
                Scrape($string, $url)
                ; Report the real page number (previously page 2 was announced as "1 of N").
                ToolTip("Completed page scrape " & $i & " of " & $MaxPages, 0, 0)
                ; Randomised pause between requests.
                $sleeptime = Random(3000, 4000)
                Sleep($sleeptime)
            Next

            MsgBox(0, "Google Scraper Tool:", "The Scrape completed, check Results.html")
            ExitLoop

        Case $helpitem
            MsgBox(0, "Google Scraper Tool:", "Google Scraper Tool V2")

        Case $exititem, $GUI_EVENT_CLOSE
            ExitLoop
    EndSwitch
WEnd



; Scraper Function
; Fetches one search result page, extracts the result links and appends them
; to Results.html as an ordered list (<ol>).
;
; Parameters:
;   $string - the search query; only used to build the request URL when $url is "".
;   $url    - full URL of the page to fetch. "" means "first page": any existing
;             Results.html is deleted first so a new run starts with a clean file.
; Returns:
;   True when the list was written, False when the HTTP object or the output
;   file could not be created/opened. (The original returned nothing, so callers
;   that ignore the result are unaffected.)
; Reads the $TestURLs checkbox: when it is ticked, only links accepted by
; CheckURL() are written.
Func Scrape($string, $url)
    Local $oHTTP = ObjCreate("winhttp.winhttprequest.5.1")
    If Not IsObj($oHTTP) Then Return False

    If $url = "" Then
        If FileExists("Results.html") Then FileDelete("Results.html")
        ; NOTE(review): $string is concatenated into the URL without URL-encoding,
        ; so characters such as & or # in the query would change the request.
        $oHTTP.Open("GET", "http://www.google.com/search?q=" & $string)
    Else
        $oHTTP.Open("GET", $url)
    EndIf

    $oHTTP.SetRequestHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10 (.NET CLR 4.0.20506)")

    ; Send the request and read the response body.
    $oHTTP.Send()
    Local $sReceived = $oHTTP.ResponseText

    ; Flag 3 = return every match of the capture group (the href value) as an array.
    ; When nothing matches the result is not an array, which is handled below.
    Local $aLinks = StringRegExp($sReceived, '<a href="([^"]+)" class=l[^<>]*>.*?</a>', 3)

    ; The checkbox state is read once per page instead of once per link.
    Local $bVerify = IsChecked($TestURLs)

    Local $sHtml = "<ol>"
    If IsArray($aLinks) Then
        For $i = 0 To UBound($aLinks) - 1
            ; The link text comes from an HTML attribute, so "&" is still written
            ; as "&amp;". That form is correct for the HTML file written below, but
            ; the URL that is actually requested must use the decoded form.
            If $bVerify And Not CheckURL(StringReplace($aLinks[$i], "&amp;", "&")) Then ContinueLoop
            $sHtml &= '<li><a href="' & $aLinks[$i] & '">' & $aLinks[$i] & '</a></li>'
        Next
    EndIf
    $sHtml &= "</ol>"

    ; Mode 1 = append, so every scraped page adds its own list to the file.
    Local $hFile = FileOpen("Results.html", 1)
    If $hFile = -1 Then Return False
    FileWrite($hFile, $sHtml)
    FileClose($hFile)
    Return True
EndFunc

; Requests $url and decides whether it should be kept as a result.
;
; Parameters:
;   $url - the URL to request (GET).
; Returns:
;   False - the request object could not be created or the HTTP status is not 200.
;   True  - status 200 and the "And Check for Keywords?" box is not ticked.
;   True/False - status 200 and the box is ticked: whether the normalised search
;           query occurs in the page text (comparison is not case-sensitive,
;           which is StringInStr's default).
; Reads the global $string (the search query set by the main loop) and the
; $ChkUrlKeywords checkbox.
Func CheckURL($url)
    ; The original named a handler "ErrorHand" that is not defined in this
    ; script; use the handler the script actually provides.
    Local $oComError = ObjEvent("AutoIt.Error", "_ErrFunc")

    Local $oRequest = ObjCreate("winhttp.winhttprequest.5.1")
    If Not IsObj($oRequest) Then Return False

    ; Timeouts in milliseconds: resolve, connect, send, receive.
    $oRequest.SetTimeouts(5000, 5000, 30000, 30000)

    $oRequest.Open("GET", $url)
    $oRequest.Send()

    ; Numeric comparison ("==" is AutoIt's case-sensitive string comparison).
    If $oRequest.Status <> 200 Then Return False

    ; Keyword matching is optional and controlled by its own checkbox. The
    ; original tested $TestURLs here, which is always ticked when this function
    ; is reached from Scrape(), so the keyword checkbox had no effect.
    If Not IsChecked($ChkUrlKeywords) Then Return True

    ; Normalise the query: drop double quotes, collapse whitespace runs to one
    ; space, drop apostrophes / single quotes.
    Local $sNeedle = StringRegExpReplace($string, '"', "")
    $sNeedle = StringRegExpReplace($sNeedle, '\s+', " ")
    $sNeedle = StringRegExpReplace($sNeedle, "’|'", "")

    ; Normalise the page the same way. The entity names had been lost from these
    ; patterns (only the decoded characters were left), so non-breaking-space and
    ; apostrophe entities in the page were never matched.
    Local $sPage = $oRequest.ResponseText
    $sPage = StringRegExpReplace($sPage, '&nbsp;|&#160;|\xA0', " ") ; non-breaking spaces -> normal space
    $sPage = StringRegExpReplace($sPage, "&rsquo;|&#8217;|&apos;|&#39;|’|'", "") ; apostrophes, as entity or literal
    $sPage = StringRegExpReplace($sPage, '</?[^<>]+>', " ") ; remove HTML elements
    $sPage = StringRegExpReplace($sPage, '\s+', " ") ; collapse spaces, line feeds and carriage returns

    ; StringInStr returns the 1-based position of the match and 0 when there is
    ; none, so a match at the very start (position 1) must count as found.
    Return StringInStr($sPage, $sNeedle) > 0
EndFunc

; Reports whether a checkbox control is currently ticked.
;   $control - control ID of the checkbox.
; Returns True when the $GUI_CHECKED bit is set in the control's state,
; otherwise False.
Func IsChecked($control)
    Local $iState = GUICtrlRead($control)
    If BitAND($iState, $GUI_CHECKED) = $GUI_CHECKED Then Return True
    Return False
EndFunc

; COM error handler registered for "AutoIt.Error" events.
; Deliberately does nothing: a failing COM call (for example an HTTP request
; that cannot be sent) is ignored so that the script keeps running.
Func _ErrFunc()
    Return
EndFunc


; Prevents resizing of the GUI window below a set size
; WM_GETMINMAXINFO handler (registered with GUIRegisterMsg).
;   $hWnd, $Msg, $wParam - supplied by the message dispatcher, not used here.
;   $lParam              - pointer to the structure Windows wants filled in.
; Writes the minimum width and height into elements 7 and 8 of the structure
; and returns 0.
Func MY_WM_GETMINMAXINFO($hWnd, $Msg, $wParam, $lParam)
    Local Const $iMinWidth = 400, $iMinHeight = 200

    ; Map ten consecutive ints onto the memory $lParam points to; no copy is
    ; made, so the writes below land directly in the caller's structure.
    Local $tMinMax = DllStructCreate("int;int;int;int;int;int;int;int;int;int", $lParam)
    DllStructSetData($tMinMax, 7, $iMinWidth)  ; min X
    DllStructSetData($tMinMax, 8, $iMinHeight) ; min Y
    Return 0
EndFunc