AD Teaching Wiki
  • Comments
  • Immutable Page
  • Menu
    • Navigation
    • RecentChanges
    • FindPage
    • Local Site Map
    • Help
    • HelpContents
    • HelpOnMoinWikiSyntax
    • Display
    • Attachments
    • Info
    • Raw Text
    • Print View
    • Edit
    • Load
    • Save
  • Login

FrontPage

AD Teaching Wiki:
  • attachment:solution-12-1.cpp of SearchEnginesWS0910

Attachment 'solution-12-1.cpp'

Download

   1 #include <iostream>
   2 #include <fstream>
   3 #include <vector>
   4 #include <ext/hash_set>
   5 #include <ext/hash_map>
   6 #include <math.h>
   7 
   8 using namespace std;
   9 using __gnu_cxx::hash_set;
  10 using __gnu_cxx::hash_map;
  11 
  12 #define MIN(a, b) ( (a) > (b) ? (b) : (a) )
  13 
  14 class StringHashFunction
  15 {
  16   public:
  17     size_t operator()(const string& s) const
  18     {
  19       size_t HashValue = 0;
  20       for (size_t i = 0; i < s.length(); i++)
  21         HashValue = 31 * HashValue + s[i];
  22       return HashValue;
  23     }
  24 };
  25 
  26 bool sortOrder(const pair<string, int>& x, const pair<string, int>& y)
  27 {
  28   return x.second > y.second;
  29 }
  30 
  31 void readInput(const string& filename, vector<vector<string> >* texts, vector<int>* realAssignments)
  32 {
  33   std::ifstream inputFile(filename.c_str());
  34   const size_t MAX_LINE_LENGTH = 100 * 1000;
  35   char* buf = new char[MAX_LINE_LENGTH + 1];
  36   size_t lineCount = 0;
  37   while (true)
  38   {
  39     inputFile.getline(buf, MAX_LINE_LENGTH);
  40     if (inputFile.eof()) break;
  41     std::string line = buf;
  42     ++lineCount;
  43     size_t pos = line.find('\t');
  44     if (pos == std::string::npos)
  45     {
  46       cout << "WARNING: line #" << lineCount
  47            << " without TAB, skipping it " << endl;
  48       continue;
  49     }
  50     std::string text = line.substr(pos + 1);
  51     std::string label = line.substr(0, pos);
  52     size_t i = 0;
  53     vector<string> currentText;
  54     while (i < text.size())  // assuming no duplicate words in titles
  55     {
  56       while (i < text.size() && !isalpha(text[i])) ++i;
  57       size_t i0 = i;
  58       while (i < text.size() && isalpha(text[i])) ++i;
  59       if (i > i0) currentText.push_back(text.substr(i0, i - i0));
  60     }
  61     texts->push_back(currentText);
  62     if (label == "SIGGRAPH") realAssignments->push_back(0);
  63     else
  64       if (label == "SIGIR") realAssignments->push_back(1);
  65       else realAssignments->push_back(2);
  66   }
  67   inputFile.close();
  68 }
  69 
  70 int main(int argc, char** argv)
  71 {
  72   int M = 100;
  73   int nofCentroids = 3;
  74   double RSS = 0;
  75   vector<hash_set<string, StringHashFunction> > centroids;
  76   vector<vector<string> > texts;
  77   vector<int> kmeansAssignments;
  78   vector<int> realAssignments;
  79   vector<int> realCounts;
  80   vector<double> jaccardDistances;
  81   vector<hash_map<string, int, StringHashFunction> > hashmaps;
  82   readInput("dblp.txt", &texts, &realAssignments);
  83   jaccardDistances.resize(texts.size());
  84   kmeansAssignments.resize(texts.size());
  85   realCounts.resize(texts.size());
  86   hashmaps.resize(nofCentroids);
  87   for(unsigned int i = 0; i < texts.size(); i++)
  88   {
  89     kmeansAssignments[i] = -1;
  90     jaccardDistances[i] = INT_MAX;
  91     realCounts[realAssignments[i]]++;
  92   }
  93   // get random centroids
  94   srand(time(NULL));
  95   for(int i = 0; i < nofCentroids; i++)
  96   {
  97     hash_set<string, StringHashFunction> currentCentroid;
  98     for(unsigned int j = 0; j < 5; j++)  // number of random titles to start with
  99     {
 100       int r = rand() % texts.size();
 101       while(realAssignments[r] != i) r = rand() % texts.size();
 102       for(unsigned int j = 0; j < texts[r].size(); j++)
 103         currentCentroid.insert(texts[r][j]);
 104     }
 105     centroids.push_back(currentCentroid);
 106   }
 107   // do iterations
 108   int iterations = 0;
 109   bool change = true;
 110   while(iterations < 100 && change)
 111   {
 112     change = false;
 113     for(unsigned int i = 0; i < texts.size(); i++)
 114     {
 115       for(unsigned int j = 0; j < centroids.size(); j++)
 116       {
 117         int intersectionSize = 0;
 118         int unionSize;
 119         for(unsigned int k = 0; k < texts[i].size(); k++)
 120           if (centroids[j].find(texts[i][k]) != centroids[j].end())
 121             intersectionSize++;
 122         unionSize = centroids[j].size() + texts[i].size() - intersectionSize;
 123         double dist = pow(1 - 1.0 * intersectionSize / unionSize, 2.0);
 124         if (dist < jaccardDistances[i])
 125         {
 126           change = true;
 127           jaccardDistances[i] = dist;
 128           kmeansAssignments[i] = j;
 129         }
 130       }
 131     }
 132     RSS = 0;
 133     for(unsigned int i = 0; i < texts.size(); i++)
 134       RSS += jaccardDistances[i]; // pow(1 - jaccardSimilarities[i], 2.0);
 135     // recalculate centroids
 136     for(unsigned int i = 0; i < centroids.size(); i++)
 137       hashmaps[i].clear();
 138     for(unsigned int i = 0; i < texts.size(); i++)
 139       for(unsigned int j = 0; j < texts[i].size(); j++)
 140         hashmaps[kmeansAssignments[i]][texts[i][j]]++;
 141     hash_map<string, int, StringHashFunction>::iterator it;
 142     for(unsigned int i = 0; i < centroids.size(); i++)
 143     {
 144       vector<pair<string, int> > counts;
 145       pair<string, int> tmpPair;
 146       for(it = hashmaps[i].begin(); it != hashmaps[i].end(); it++)
 147       {
 148         tmpPair.first = it->first;
 149         tmpPair.second = it->second;
 150         counts.push_back(tmpPair);
 151       }
 152       sort(counts.begin(), counts.end(), sortOrder);
 153       int n = MIN(M, (int)counts.size());
 154       centroids[i].clear();
 155       for(int j = 0; j < n; j++)
 156         centroids[i].insert(counts[j].first);
 157     }
 158     ++iterations;
 159     cout << "Iteration " << iterations << ". RSS = " << RSS << endl;
 160   }
 161   // find which centroid is which
 162   vector<vector<int> > counts;
 163   vector<int> centroidMap;
 164   vector<int> centroidCounts;
 165   centroidMap.resize(nofCentroids);
 166   counts.resize(nofCentroids);
 167   centroidCounts.resize(nofCentroids);
 168   for(int i = 0; i < nofCentroids; i++)
 169     counts[i].resize(nofCentroids);
 170   for(unsigned int i = 0; i < texts.size(); i++)
 171   {
 172     counts[kmeansAssignments[i]][realAssignments[i]]++;
 173     centroidCounts[kmeansAssignments[i]]++;
 174   }
 175   vector<bool> centroidTaken;
 176   centroidTaken.resize(nofCentroids);
 177   for(int i = 0; i < nofCentroids; i++)
 178   {
 179     int maxCount = 0;
 180     for(int j = 0; j < nofCentroids; j++)
 181     {
 182       if (counts[i][j] > maxCount && !centroidTaken[j])
 183       {
 184         maxCount = counts[i][j];
 185         centroidMap[i] = j;
 186       }
 187     }
 188     centroidTaken[centroidMap[i]] = true;
 189   }
 190   // compute precision and recall
 191   double precision = 0;
 192   double recall = 0;
 193   for(int i = 0; i < nofCentroids; i++)
 194     precision += 1.0 * counts[i][centroidMap[i]] / realCounts[centroidMap[i]];
 195   precision /= nofCentroids;
 196   for(int i = 0; i < nofCentroids; i++)
 197     recall += 1.0 * counts[i][centroidMap[i]] / centroidCounts[i];
 198   recall /= nofCentroids;
 199   cout << "Precision : " << 100 * precision << "%" << endl;
 200   cout << "Recal :     " << 100 * recall << "%" << endl;
 201 }

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2010-02-04 18:41:13, 2.3 KB) [[attachment:HierarchicalClustering.cpp]]
  • [get | view] (2010-01-12 20:07:24, 2.2 KB) [[attachment:Timer.h]]
  • [get | view] (2010-01-24 15:42:29, 443.7 KB) [[attachment:dblp.txt]]
  • [get | view] (2009-10-22 14:29:03, 44.4 KB) [[attachment:exercise-1.pdf]]
  • [get | view] (2010-01-14 16:06:21, 69.4 KB) [[attachment:exercise-10.pdf]]
  • [get | view] (2010-01-21 14:53:18, 62.0 KB) [[attachment:exercise-11.pdf]]
  • [get | view] (2010-01-28 14:22:59, 61.3 KB) [[attachment:exercise-12.pdf]]
  • [get | view] (2010-02-04 15:31:54, 59.0 KB) [[attachment:exercise-13.pdf]]
  • [get | view] (2010-02-15 17:47:13, 50.9 KB) [[attachment:exercise-14.pdf]]
  • [get | view] (2009-10-29 15:03:08, 43.5 KB) [[attachment:exercise-2.pdf]]
  • [get | view] (2009-11-05 15:24:10, 45.8 KB) [[attachment:exercise-3.pdf]]
  • [get | view] (2009-11-12 15:40:55, 55.2 KB) [[attachment:exercise-4.pdf]]
  • [get | view] (2009-11-20 03:16:57, 53.1 KB) [[attachment:exercise-5.pdf]]
  • [get | view] (2010-03-10 16:24:50, 68.6 KB) [[attachment:exercise-6.pdf]]
  • [get | view] (2009-12-03 15:42:36, 43.2 KB) [[attachment:exercise-7.pdf]]
  • [get | view] (2009-12-22 19:37:35, 57.5 KB) [[attachment:exercise-8.pdf]]
  • [get | view] (2010-01-13 23:57:57, 55.4 KB) [[attachment:exercise-9.pdf]]
  • [get | view] (2009-10-23 16:02:29, 93.8 KB) [[attachment:lecture-1.pdf]]
  • [get | view] (2010-01-17 13:46:23, 162.4 KB) [[attachment:lecture-10.pdf]]
  • [get | view] (2010-01-21 22:21:41, 154.9 KB) [[attachment:lecture-11.pdf]]
  • [get | view] (2010-01-29 16:11:40, 231.0 KB) [[attachment:lecture-12.pdf]]
  • [get | view] (2010-02-04 18:34:40, 147.8 KB) [[attachment:lecture-13.pdf]]
  • [get | view] (2010-02-11 19:36:02, 233.2 KB) [[attachment:lecture-14.pdf]]
  • [get | view] (2009-10-30 00:06:50, 198.7 KB) [[attachment:lecture-2.pdf]]
  • [get | view] (2009-11-05 21:15:21, 106.4 KB) [[attachment:lecture-3.pdf]]
  • [get | view] (2009-11-12 15:53:25, 193.3 KB) [[attachment:lecture-4.pdf]]
  • [get | view] (2009-11-19 03:29:30, 168.2 KB) [[attachment:lecture-5.pdf]]
  • [get | view] (2009-11-27 02:32:33, 151.1 KB) [[attachment:lecture-6.pdf]]
  • [get | view] (2009-12-03 21:22:18, 68.0 KB) [[attachment:lecture-7.pdf]]
  • [get | view] (2009-12-10 21:46:26, 156.3 KB) [[attachment:lecture-8.pdf]]
  • [get | view] (2010-01-07 18:55:37, 127.0 KB) [[attachment:lecture-9.pdf]]
  • [get | view] (2010-02-11 19:42:15, 110.5 KB) [[attachment:lecture-projects.pdf]]
  • [get | view] (2009-12-03 21:20:02, 10.0 KB) [[attachment:prefix-search.tar]]
  • [get | view] (2010-03-10 16:18:17, 68.2 KB) [[attachment:solution-10.pdf]]
  • [get | view] (2010-03-06 18:57:52, 53.0 KB) [[attachment:solution-11.pdf]]
  • [get | view] (2010-03-08 13:55:50, 6.3 KB) [[attachment:solution-12-1.cpp]]
  • [get | view] (2010-03-08 14:16:18, 109.0 KB) [[attachment:solution-12.pdf]]
  • [get | view] (2010-03-10 16:38:55, 44.8 KB) [[attachment:solution-6.pdf]]
  • [get | view] (2010-01-19 13:59:48, 80.4 KB) [[attachment:solution-9.pdf]]
  • [get | view] (2010-03-07 12:59:19, 85.9 KB) [[attachment:solution-midterm.pdf]]
  • [get | view] (2009-11-08 15:22:20, 25.1 KB) [[attachment:trec_queries.txt]]
 All files | Selected Files: delete move to page copy to page

You are not allowed to attach a file to this page.

  • MoinMoin Powered
  • Python Powered
  • GPL licensed
  • Valid HTML 4.01