From 96d340d253774e52a464231140810aba6f350951 Mon Sep 17 00:00:00 2001
From: deva <deva>
Date: Thu, 18 Aug 2011 09:14:13 +0000
Subject: Clean out annoying urls. Don't remove www. unless really necessary.

---
 utils/modules/sitestats.php | 164 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 142 insertions(+), 22 deletions(-)

diff --git a/utils/modules/sitestats.php b/utils/modules/sitestats.php
index 63db39a..d16e491 100644
--- a/utils/modules/sitestats.php
+++ b/utils/modules/sitestats.php
@@ -4,6 +4,15 @@ global $UTIL_DIR;
 
 include_once($UTIL_DIR . "/convert.php");
 
+class StatEntry { // One parsed <e> record; fields are filled in by SiteStats::endElement().
+  public $time;       // text of <t>: the time() value written by log()
+  public $remoteaddr; // text of <ra>: $_SERVER['REMOTE_ADDR'] at log time
+  public $page;       // text of <p>: $GLOBALS['page'] at log time
+  public $agent;      // text of <a>: the User-Agent header
+  public $referer;    // text of <r>: the Referer header
+  public $uri;        // text of <u>: $_SERVER['REQUEST_URI'] at log time
+}
+
 class SiteStats {
 
   public $statsdir;
@@ -47,7 +56,13 @@ class SiteStats {
       }
       echo "</ul>\n";
 
+      $hits = 0;
+      foreach($this->pages as $k => $v) {
+        $hits += $v;
+      }
+
       echo "<p><strong>Pages:</strong></p>\n";
+      echo "<p>".$hits." hits.</p>\n";
       echo "<ul>\n";
       foreach($this->pages as $k => $v) {
         echo "  <li>(".$v.") <a href=\"?page=" . $k . "\">".$k."</a></li>\n";
@@ -66,6 +81,106 @@ class SiteStats {
     }
   }
 
+  private $top;               // index into $entries of the <e> record currently being parsed
+  private $entries = array(); // StatEntry objects appended by startElement()
+  private $str;               // character data collected for the element being parsed
+  function cdata($parser, $cdata) // character-data callback registered in read()
+  {
+    $this->str .= $cdata; // append rather than assign: text can arrive in more than one call
+  }
+
+  public function startElement($parser, $name, $attribs) // start-tag callback registered in read()
+  {
+
+    if($name == "E") { // upper case although log() writes <e>; presumably the parser's default case folding
+      $this->top = array_push($this->entries, new StatEntry()) - 1; // array_push returns the new count, so -1 is the new entry's index
+      return;
+    }
+
+    $this->str = ""; // any other element: start collecting its text afresh
+  }
+  
+  public function endElement($parser, $name) // end-tag callback: store the collected text on the current entry
+  {
+    if($name == "T") $this->entries[$this->top]->time = $this->str; // <t> time
+    if($name == "RA") $this->entries[$this->top]->remoteaddr = $this->str; // <ra> remote address
+    if($name == "P") $this->entries[$this->top]->page = $this->str; // <p> page
+    if($name == "A") $this->entries[$this->top]->agent = $this->str; // <a> user agent
+    if($name == "R") $this->entries[$this->top]->referer = $this->str; // <r> referer
+    if($name == "U") $this->entries[$this->top]->uri = $this->str; // <u> request URI
+  }
+
+  private function read($file)
+  {
+    // Parse one stats file and add its entries to the visitor, referer and page tallies.
+    // '&' is swapped for '@' before parsing (presumably raw '&' breaks the XML); restored below.
+    $xml = file_get_contents($file);
+    $xml = preg_replace('/&/', '@', $xml);
+
+    // The logged records are wrapped in a <stats> root element to form one document.
+    $xmlhead = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<stats>\n";
+    $xmlfoot = "</stats>\n";
+
+    // Forget entries from any earlier call so they are not tallied a second time.
+    $this->entries = array();
+
+    $parser = xml_parser_create();
+    xml_set_object($parser, $this);
+    xml_set_element_handler($parser, "startElement", "endElement");
+    xml_set_character_data_handler($parser, "cdata");
+
+    // Feed head, body and foot in order; only the last chunk is flagged as final.
+    $chunks = array(array($xmlhead, FALSE), array($xml, FALSE), array($xmlfoot, TRUE));
+    foreach($chunks as $chunk) {
+      if (!xml_parse($parser, $chunk[0], $chunk[1])) {
+        die(sprintf("XML error: %s at line %d",
+                    xml_error_string(xml_get_error_code($parser)),
+                    xml_get_current_line_number($parser)));
+      }
+    }
+
+    xml_parser_free($parser);
+
+    foreach($this->entries as $entry) {
+
+      // Skip the hard-coded address and any user agent containing "bot".
+      // Compare with !== so a match at offset 0 is not mistaken for "not found".
+      if($entry->remoteaddr == "87.63.201.210") continue;
+      if(stripos($entry->agent, "bot") !== FALSE) continue;
+
+      $this->visitors[$entry->remoteaddr]++;
+
+      // Count the page before the referer filter below, which only concerns referers.
+      if($entry->page != "") $this->pages[$entry->page]++;
+
+      $url = $entry->referer;
+      $url = preg_replace('/@/', '&', $url);
+
+      // Don't show 'banned' urls.
+      if(strpos($url, "executionroom.com") !== FALSE ||
+         strpos($url, "google") !== FALSE ||
+         strpos($url, "bing") !== FALSE ||
+         strpos($url, "viagra") !== FALSE ||
+         strpos($url, "inthesetimes.com") !== FALSE ||
+         strpos($url, "prescription") !== FALSE ||
+         $url == "") continue;
+
+      // If url exists already without www remove www from this url.
+      if(substr($url, 7, 3) == "www") {
+        $surl = "http://" . substr($url, 11);
+        if(!empty($this->referers[$surl])) $url = $surl;
+      }
+      $this->referers[$url]++;
+    }
+
+    // Sort each tally by count, highest first.
+    arsort($this->referers);
+    arsort($this->pages);
+    arsort($this->visitors);
+
+  }
+
+  /*
   private function read($file)
   {
     $xml = file_get_contents($file);
@@ -75,63 +190,68 @@ class SiteStats {
     $dom = new DomDocument;
     $dom->preserveWhiteSpace = TRUE;
     $dom->loadXML($xml);
-    $entries = $dom->getElementsByTagName('entry');
 
-    foreach($entries as $entry) {
+    $entries = $dom->getElementsByTagName('e');
 
+    foreach($entries as $entry) {
       $skip = false;
-      $remoteaddrs = $entry->getElementsByTagName('remoteaddr');
+      $remoteaddrs = $entry->getElementsByTagName('ra');
       foreach($remoteaddrs as $remoteaddr) {
         $addr = $remoteaddr->textContent;
         if($addr == "87.63.201.210") $skip = true;
       }
 
-      $agents = $entry->getElementsByTagName('agent');
+      $agents = $entry->getElementsByTagName('a');
       foreach($agents as $agent) {
         $ag = $agent->textContent;
-        if(stripos($ag, "bot") > 0) $skip = true;
+        if(stripos($ag, "bot") != FALSE) $skip = true;
       }
 
       if($skip == true) continue;
 
       $this->visitors[$addr]++;
 
-      $refs = $entry->getElementsByTagName('referer');
+      $refs = $entry->getElementsByTagName('r');
       foreach($refs as $ref) {
         $url = $ref->textContent;
-        if(strpos($url, "executionroom.com") > 0 || $url == "") {} else $this->referers[$url]++;
+        if(strpos($url, "executionroom.com") != FALSE ||
+           strpos($url, "google") != FALSE ||
+           strpos($url, "bing") != FALSE ||
+           $url == "") {} else {
+          if(substr($url, 7, 3) == "www") $url = "http://" . substr($url, 11);
+          $this->referers[$url]++;
+        }
       }
 
-      $pages = $entry->getElementsByTagName('page');
+      $pages = $entry->getElementsByTagName('p');
       foreach($pages as $page) {
         $pg = $page->textContent;
         if($pg != "") $this->pages[$pg]++;
       }
-      
     }
 
     arsort($this->referers);
     arsort($this->pages);
     arsort($this->visitors);
   }
-
+  */
   public function log($loadtime)
   {
     $now = time();
 
-    $str = "<entry>".
-      "<time>".xmlenc($now)."</time>". // Time
-      "<remoteaddr>".xmlenc($_SERVER['REMOTE_ADDR'])."</remoteaddr>". // remote ip
-      "<remotehost>".xmlenc($_SERVER['REMOTE_HOST'])."</remotehost>". // remote hostname
-      "<loadtime>".xmlenc($loadtime)."</loadtime>". // Loadtime
-      "<page>".xmlenc($GLOBALS['page'])."</page>". // Page
+    $str = "<e>".
+      "<t>".xmlenc($now)."</t>". // Time
+      "<ra>".xmlenc($_SERVER['REMOTE_ADDR'])."</ra>". // remote ip
+      //      "<rh>".xmlenc($_SERVER['REMOTE_HOST'])."</rh>". // remote hostname
+      //      "<l>".xmlenc($loadtime)."</l>". // Loadtime
+      "<p>".xmlenc($GLOBALS['page'])."</p>". // Page
       //      $_SERVER['REMOTE_PORT'] . // current port
       //      $_SERVER['SCRIPT_FILENAME'] . // script name
-      "<agent>".xmlenc($_SERVER['HTTP_USER_AGENT'])."</agent>". // User agent (browser)
-      "<referer>".xmlenc($_SERVER['HTTP_REFERER'])."</referer>". // referer (link)
-      "<uri>".xmlenc($_SERVER['REQUEST_URI'])."</uri>". // URI
+      "<a>".xmlenc($_SERVER['HTTP_USER_AGENT'])."</a>". // User agent (browser)
+      "<r>".xmlenc($_SERVER['HTTP_REFERER'])."</r>". // referer (link)
+      "<u>".xmlenc($_SERVER['REQUEST_URI'])."</u>". // URI
       // GeoIP ??
-      "</entry>\n";
+      "</e>\n";
 
     $file = $this->getFilename($now);
 
@@ -144,8 +264,8 @@ class SiteStats {
 
   private function getFilename($timestamp)
   {
-    $year = "2010";
-    $month = "03";
+    $year = date("Y", $timestamp); // four-digit year of $timestamp
+    $month = date("m", $timestamp); // two-digit, zero-padded month of $timestamp
     $file = $this->statsdir . "/" . $year . "/" . $month . ".xml";
     return $file;
   }
-- 
cgit v1.2.3